This is the formal analysis file for the “The Nature of Reasonableness” by by Kevin Tobia, Ivar R. Hannikainen, David Kamper, Guilherme Almeida, Piotr Bystranowski, Niek Strohmaier, Vilius Dranseika, Markus Kneer, Fernando Aguiar, Kristina Dolinina, Bartosz Janik, Eglė Lauraitytė, Alice Liefgreen, Maciej Próchnicki, Alejandro Rosas, Vivek Kumar Shukla, and Noel Struchiner (2025, Stanford Journal of International Law).

Introduction and Loading of Files

# Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stats)
library(ggplot2)
library(ggthemes)
library(corrplot)
## corrplot 0.95 loaded
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
DATA_PATH <- "/Users/dgkamper/Library/CloudStorage/GoogleDrive-dgkamper@gmail.com/My Drive/DGK Lab/Collaborations/Language and Law Laboratory/Analysis/LegalResonablenessAnalysis/Data_all_numeric.csv"

1. Initial Data Loading and Preparation

# Read the data from specified path
Data_All <- read.csv(DATA_PATH, stringsAsFactors = FALSE)

# Verify data structure
glimpse(Data_All)
## Rows: 2,426
## Columns: 42
## $ q1             <dbl> 2, 2, 3, 2, 2, 3, 2, 3, 2, 2, 3, 3, 4, 1, 4, 3, 3, 2, 2…
## $ q2             <dbl> 0, 1, 3, 1, 2, 15, 5, 2, 7, 2, 5, 3, 5, 7, 20, 5, 4, 2,…
## $ q3             <dbl> 8, 7, 10, 10, 3, 14, 2, 5, 8, 7, 3, 3, 5, 5, 4, 2, 7, 5…
## $ q4             <dbl> 2500, 2000, 2000, 50, 2000, 1200, 2, 1800, 2000, 1500, …
## $ q5             <dbl> 120, 30, 50, 50, 60, 30, 3, 30, 100, 60, 30, 30, 40, 30…
## $ q6             <dbl> 0, 1, 1, 0, 2, 0, 5, 0, 10, 5, 0, 7, 4, 4, 10, 10, 7, 5…
## $ q7             <chr> "20", "2", "30", "2", "15", "0", "5", "10", "15", "10",…
## $ q8             <dbl> 100, 10, 1, 20, 12, 5, 5, 4, 12, 12, 3, 4, 2, 12, 3, 10…
## $ q9             <dbl> 0, 2, 10, 1, 4, 3, 10, 3, 6, 8, 5, 10, 2, 6, 25, 7, 10,…
## $ q10            <dbl> 3, 50, 5, 5, 3, 1, 4, 1, 8, 0, 0, 10, 1, 1, 2, 0, 1, 1,…
## $ q11            <chr> "0", "1000", "50", "5", "250", "0", "7", "0", "400", "0…
## $ q12            <dbl> 0, 10, 10, 10, 5, 0, 6, 1, 20, 75, 5, 30, 30, 1, 1, 5, …
## $ q13            <dbl> 25, 1, 20, 15, 10, 10, 2, 6, 15, 100, 16, 50, 50, 4, 25…
## $ q14            <chr> "30", "5", "15", "3", "5", "10", "8", "5", "10", "5", "…
## $ q15            <dbl> 25, 30, 5, 5, 7, 25, 6, 4, 25, 4, 12, 4, 5, 4, 2, 6, 8,…
## $ q16            <dbl> 5, 30, 2, 3, 30, 25, 6, 4, 4, 5, 4, 4, 4, 4, 2, 2, 30, …
## $ q17            <chr> "3", "1", "1", "1", "1", "0", "1", "0", "2", "1", "0", …
## $ q18            <dbl> 12, 1, 5, 15, 10, 0, 1, 1, 5, 0, 0, 2, 10, 5, 15, 5, 5,…
## $ q19            <int> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,…
## $ q20            <int> 0, 75, 5, 1, 10, 0, 4, 2, 30, 0, 0, 40, 15, 1, 10, 0, 8…
## $ q21            <dbl> 0, 12, 6, 2, 10, 6, 4, 4, 6, 5, 6, 9, 6, 5, 10, 7, 6, 8…
## $ q22            <dbl> 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4…
## $ q23            <dbl> 1, 0, 2, 3, 3, 10, 6, 1, 8, 5, 1, 2, 4, 2, 1, 4, 8, 2, …
## $ q24            <dbl> 5, 2, 5, 3, 4, 48, 4, 24, 72, 24, 6, 10, 24, 10, 2, 6, …
## $ q25            <dbl> 0, 10, 1000, 5, 300, 0, 5, 1000, 20000, 1000, 500, 2000…
## $ q26            <dbl> 3, 2, 2, 2, 2, 1, 4, 2, 4, 2, 2, 4, 6, 3, 1, 2, 3, 8, 2…
## $ q27            <dbl> 0, 6, 2, 5, 6, 3, 50, 5, 25, 30, 0, 6, 5, 5, 5, 12, 6, …
## $ q28            <dbl> 5, 5, 10, 50, 20, 5, 5, 10, 25, 60, 100, 80, 5, 20, 10,…
## $ q29            <dbl> 100, 50, 2, 25, 100, 100, 5, 50, 100, 100, 100, 100, 15…
## $ q30            <dbl> 3, 4, 2, 1, 6, 1, 2, 12, 26, 5, 0, 1, 8, 2, 4, 3, 4, 2,…
## $ q31            <dbl> 20, 750, 2, 12, 25, 0, 3, 100, 10, 30, 250, 50, 50, 10,…
## $ q32            <dbl> 5, 1, 2, 48, 24, 24, 55, 24, 24, 24, 24, 48, 24, 48, 12…
## $ q33            <chr> "0", "10", "2", "25", "4", "8", "2", "4", "4", "3.2", "…
## $ q34            <dbl> 95, 1, 2, 15, 50, 0, 5, 0, 95, 85, 100, 99, 50, 30, 15,…
## $ age            <int> 18, 47, 28, 30, NA, NA, 30, 50, 47, 33, 0, 0, 30, 43, 4…
## $ Gender         <chr> "Male", "Male", "Male", "Male", "Female", "Female", "Ma…
## $ ID             <chr> "usa1", "usa2", "usa3", "usa4", "usa5", "usa6", "usa7",…
## $ Country_name   <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA",…
## $ Condition_name <chr> "Reasonable", "Reasonable", "Reasonable", "Reasonable",…
## $ ID.1           <chr> "usa1", "usa2", "usa3", "usa4", "usa5", "usa6", "usa7",…
## $ Country        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Condition      <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…

2. Exclude participants whose answer to Q19 does not equal 15 [comprehension/attention question].

# Split data by condition
Data_All_Average <- Data_All %>% filter(Condition_name == "Average")
Data_All_Ideal <- Data_All %>% filter(Condition_name == "Ideal")
Data_All_Reasonable <- Data_All %>% filter(Condition_name == "Reasonable")

# Exclude participants whose answer to Q19 does not equal 15 [comprehension/attention question].
Data_All_Average_Pass1 <- Data_All_Average %>% filter(q19 == 15)
Data_All_Ideal_Pass1 <- Data_All_Ideal %>% filter(q19 == 15)
Data_All_Reasonable_Pass1 <- Data_All_Reasonable %>% filter(q19 == 15)

# Print participant counts
condition_counts <- data.frame(
  Condition = c("Average", "Ideal", "Reasonable"),
  Original_Count = c(nrow(Data_All_Average), nrow(Data_All_Ideal), nrow(Data_All_Reasonable)),
  After_Attention_Check = c(nrow(Data_All_Average_Pass1), nrow(Data_All_Ideal_Pass1), nrow(Data_All_Reasonable_Pass1)),
  Excluded_Count = c(nrow(Data_All_Average) - nrow(Data_All_Average_Pass1),
                     nrow(Data_All_Ideal) - nrow(Data_All_Ideal_Pass1),
                     nrow(Data_All_Reasonable) - nrow(Data_All_Reasonable_Pass1)),
  Exclusion_Rate = c((nrow(Data_All_Average) - nrow(Data_All_Average_Pass1))/nrow(Data_All_Average),
                     (nrow(Data_All_Ideal) - nrow(Data_All_Ideal_Pass1))/nrow(Data_All_Ideal),
                     (nrow(Data_All_Reasonable) - nrow(Data_All_Reasonable_Pass1))/nrow(Data_All_Reasonable))
)

# Format the exclusion rate as percentage
condition_counts$Exclusion_Rate <- scales::percent(condition_counts$Exclusion_Rate)

# Display the participant counts table
kable(condition_counts, caption = "Participant Counts Before and After Attention Check") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Participant Counts Before and After Attention Check
Condition Original_Count After_Attention_Check Excluded_Count Exclusion_Rate
Average 815 793 22 2.70%
Ideal 825 800 25 3.03%
Reasonable 786 758 28 3.56%
# Define the question columns - ensuring they actually exist in the data
question_cols <- c("q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", 
                  "q11", "q12", "q13", "q14", "q15", "q16", "q17", "q18", 
                  "q20", "q21", "q22", "q23", "q24", "q25", "q26", "q27", 
                  "q28", "q29", "q30", "q31", "q32", "q33", "q34")

# Check if all question columns exist in the data
existing_cols <- question_cols[question_cols %in% colnames(Data_All_Average_Pass1)]
missing_cols <- question_cols[!question_cols %in% colnames(Data_All_Average_Pass1)]

print("Existing question columns:")
## [1] "Existing question columns:"
print(existing_cols)
##  [1] "q1"  "q2"  "q3"  "q4"  "q5"  "q6"  "q7"  "q8"  "q9"  "q10" "q11" "q12"
## [13] "q13" "q14" "q15" "q16" "q17" "q18" "q20" "q21" "q22" "q23" "q24" "q25"
## [25] "q26" "q27" "q28" "q29" "q30" "q31" "q32" "q33" "q34"
print("Missing question columns:")
## [1] "Missing question columns:"
print(missing_cols)
## character(0)

3. Check Number of Participants per Condition

# Continue with only the existing columns
question_cols <- existing_cols

# Convert columns to numeric
Data_All_Average_Pass1 <- Data_All_Average_Pass1 %>%
  mutate(across(all_of(question_cols), ~as.numeric(as.character(.))))

Data_All_Ideal_Pass1 <- Data_All_Ideal_Pass1 %>%
  mutate(across(all_of(question_cols), ~as.numeric(as.character(.))))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(all_of(question_cols), ~as.numeric(as.character(.)))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
Data_All_Reasonable_Pass1 <- Data_All_Reasonable_Pass1 %>%
  mutate(across(all_of(question_cols), ~as.numeric(as.character(.))))
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(all_of(question_cols), ~as.numeric(as.character(.)))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
# Verify the number of participants in each condition
cat("Ideal condition:", nrow(Data_All_Ideal_Pass1), "participants\n")
## Ideal condition: 800 participants
cat("Average condition:", nrow(Data_All_Average_Pass1), "participants\n")
## Average condition: 793 participants
cat("Reasonable condition:", nrow(Data_All_Reasonable_Pass1), "participants\n")
## Reasonable condition: 758 participants

4a. Filter Data [Due to medians, exclusions of outliers]

No parts to be completed here: done in Analysis 3.

4b. Filter Data [Any response that is > 3SD from QMean, exclude]

# Function to filter data based on 3 SD criterion
filter_data <- function(data, cols) {
  # Pre-allocate a list to track excluded participants for each question
  exclusions_by_question <- list()
  
  # Original count
  original_count <- nrow(data)
  
  result <- data
  for (col in cols) {
    if (col %in% colnames(data)) {
      # Calculate mean and standard deviation
      avg <- mean(result[[col]], na.rm = TRUE)
      s <- sd(result[[col]], na.rm = TRUE)
      
      # Identify outliers
      outlier_indices <- which(!is.na(result[[col]]) & abs(result[[col]] - avg) > 3 * s)
      exclusions_by_question[[col]] <- length(outlier_indices)
      
      # Filter out outliers
      result <- result %>%
        filter(is.na(!!sym(col)) | abs(!!sym(col) - avg) <= 3 * s)
    }
  }
  
  # Final count
  final_count <- nrow(result)
  
  return(list(
    filtered_data = result,
    original_count = original_count,
    final_count = final_count,
    excluded_count = original_count - final_count,
    exclusion_rate = (original_count - final_count) / original_count,
    exclusions_by_question = exclusions_by_question
  ))
}

# Apply filtering and track exclusion counts
avg_filter_results <- filter_data(Data_All_Average_Pass1, question_cols)
ideal_filter_results <- filter_data(Data_All_Ideal_Pass1, question_cols)
reasonable_filter_results <- filter_data(Data_All_Reasonable_Pass1, question_cols)

# Extract the filtered data
Data_All_Average_Filtered <- avg_filter_results$filtered_data
Data_All_Ideal_Filtered <- ideal_filter_results$filtered_data
Data_All_Reasonable_Filtered <- reasonable_filter_results$filtered_data

# Display outlier exclusion summary
outlier_summary <- data.frame(
  Condition = c("Average", "Ideal", "Reasonable"),
  Original_Count = c(avg_filter_results$original_count, 
                    ideal_filter_results$original_count, 
                    reasonable_filter_results$original_count),
  After_Outlier_Removal = c(avg_filter_results$final_count, 
                           ideal_filter_results$final_count, 
                           reasonable_filter_results$final_count),
  Excluded_Count = c(avg_filter_results$excluded_count, 
                    ideal_filter_results$excluded_count, 
                    reasonable_filter_results$excluded_count),
  Exclusion_Rate = c(avg_filter_results$exclusion_rate,
                    ideal_filter_results$exclusion_rate,
                    reasonable_filter_results$exclusion_rate)
)

# Format the exclusion rate as percentage
outlier_summary$Exclusion_Rate <- scales::percent(outlier_summary$Exclusion_Rate)

# Display the outlier exclusion summary table
kable(outlier_summary, caption = "Participant Counts Before and After Outlier Removal") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Participant Counts Before and After Outlier Removal
Condition Original_Count After_Outlier_Removal Excluded_Count Exclusion_Rate
Average 793 609 184 23.20%
Ideal 800 608 192 24.00%
Reasonable 758 590 168 22.16%
# Define helper function for coefficients
get_model_coefs <- function(model) {
  summary_model <- summary(model)
  coef_table <- summary_model$coefficients
  data.frame(
    Term = rownames(coef_table),
    Estimate = coef_table[, "Estimate"],
    Std_Error = coef_table[, "Std. Error"],
    t_value = coef_table[, "t value"],
    p_value = coef_table[, "Pr(>|t|)"]
  )
}

# Define helper function for p-values
format_pvalue <- function(p_value) {
  if (is.na(p_value)) {
    return("NA")
  } else if (p_value < 0.001) {
    return("< .001")
  } else {
    # Format to 3 decimal places, removing leading zero if present
    formatted_p <- sprintf("%.3f", p_value)
    # Remove leading zero for values less than 1 (e.g., "0.123" -> ".123")
    formatted_p <- sub("^0\\.", ".", formatted_p)
     # Handle case where p-value rounds to 1.000
    if (formatted_p == "1.000" && p_value < 1) {
        return(".999") # Or indicate somehow it's slightly less than 1
    } else if (formatted_p == "1.000" && p_value >= 1) {
         return("1.000") # Or handle as appropriate if p can be >= 1
    }
    return(formatted_p)
  }
}

Analysis 3 [NO 3 SD Removed]: Overall results, with median ratings

1. Compute the median rating

# Calculate medians from attention-check filtered data ONLY
column_medians_average_no3sd <- sapply(Data_All_Average_Pass1[, question_cols], median, na.rm = TRUE)
column_medians_ideal_no3sd <- sapply(Data_All_Ideal_Pass1[, question_cols], median, na.rm = TRUE)
column_medians_reasonable_no3sd <- sapply(Data_All_Reasonable_Pass1[, question_cols], median, na.rm = TRUE)

# Create a dataframe for the medians (NO 3SD)
medians_df_no3sd <- data.frame(
  Question = names(column_medians_average_no3sd),
  Average = column_medians_average_no3sd,
  Ideal = column_medians_ideal_no3sd,
  Reasonable = column_medians_reasonable_no3sd
)

# Display the medians for each question
kable(medians_df_no3sd, caption = "Median Values (NO 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Median Values (NO 3SD Removal)
Question Average Ideal Reasonable
q1 q1 3.0 2.0 2.0
q2 q2 7.0 2.0 4.0
q3 q3 3.0 6.0 5.0
q4 q4 2000.0 2000.0 2000.0
q5 q5 20.0 30.0 30.0
q6 q6 10.0 1.0 3.0
q7 q7 15.0 5.0 10.0
q8 q8 5.0 12.0 10.0
q9 q9 5.0 3.0 5.0
q10 q10 5.0 0.0 2.0
q11 q11 500.0 0.0 10.0
q12 q12 30.0 1.0 10.0
q13 q13 40.0 10.0 15.0
q14 q14 10.0 2.0 5.0
q15 q15 9.0 10.0 8.0
q16 q16 5.0 5.0 4.0
q17 q17 3.0 0.0 1.0
q18 q18 15.0 2.0 9.5
q20 q20 20.0 0.0 4.0
q21 q21 8.0 5.0 5.0
q22 q22 10.0 7.0 7.0
q23 q23 2.0 3.0 3.0
q24 q24 16.0 24.0 24.0
q25 q25 2000.0 500.0 1000.0
q26 q26 8.0 2.0 4.0
q27 q27 12.0 5.0 6.0
q28 q28 10.0 20.0 20.0
q29 q29 55.0 90.0 80.0
q30 q30 12.0 4.0 4.0
q31 q31 50.0 13.0 25.0
q32 q32 24.0 48.0 40.0
q33 q33 5.5 2.5 4.0
q34 q34 50.0 10.0 50.0

2a. (Small constant inf values): Convert median responses to a natural log scale (NO 3SD Removal)

# Define Log function: Small constant for zero
log_smallvalue <- function(x) {
  x_safe <- x
  # Replace only actual zeros, leave NAs as NA
  x_safe[which(x == 0 & !is.na(x))] <- 0.01
  log(x_safe)
}

# Apply the log function to NO 3SD medians
log_medians_average_no3sd_smallval <- log_smallvalue(column_medians_average_no3sd)
log_medians_ideal_no3sd_smallval <- log_smallvalue(column_medians_ideal_no3sd)
log_medians_reasonable_no3sd_smallval <- log_smallvalue(column_medians_reasonable_no3sd)

# Create a dataframe for the log medians (NO 3SD, Small Constant)
log_medians_df_no3sd_smallval <- data.frame(
  Question = names(log_medians_average_no3sd_smallval),
  Average = log_medians_average_no3sd_smallval,
  Ideal = log_medians_ideal_no3sd_smallval,
  Reasonable = log_medians_reasonable_no3sd_smallval
)

# Display the log medians
kable(log_medians_df_no3sd_smallval, caption = "Log-Transformed (Small Constant) Median Values (NO 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Log-Transformed (Small Constant) Median Values (NO 3SD Removal)
Question Average Ideal Reasonable
q1 q1 1.0986123 0.6931472 0.6931472
q2 q2 1.9459101 0.6931472 1.3862944
q3 q3 1.0986123 1.7917595 1.6094379
q4 q4 7.6009025 7.6009025 7.6009025
q5 q5 2.9957323 3.4011974 3.4011974
q6 q6 2.3025851 0.0000000 1.0986123
q7 q7 2.7080502 1.6094379 2.3025851
q8 q8 1.6094379 2.4849066 2.3025851
q9 q9 1.6094379 1.0986123 1.6094379
q10 q10 1.6094379 -4.6051702 0.6931472
q11 q11 6.2146081 -4.6051702 2.3025851
q12 q12 3.4011974 0.0000000 2.3025851
q13 q13 3.6888795 2.3025851 2.7080502
q14 q14 2.3025851 0.6931472 1.6094379
q15 q15 2.1972246 2.3025851 2.0794415
q16 q16 1.6094379 1.6094379 1.3862944
q17 q17 1.0986123 -4.6051702 0.0000000
q18 q18 2.7080502 0.6931472 2.2512918
q20 q20 2.9957323 -4.6051702 1.3862944
q21 q21 2.0794415 1.6094379 1.6094379
q22 q22 2.3025851 1.9459101 1.9459101
q23 q23 0.6931472 1.0986123 1.0986123
q24 q24 2.7725887 3.1780538 3.1780538
q25 q25 7.6009025 6.2146081 6.9077553
q26 q26 2.0794415 0.6931472 1.3862944
q27 q27 2.4849066 1.6094379 1.7917595
q28 q28 2.3025851 2.9957323 2.9957323
q29 q29 4.0073332 4.4998097 4.3820266
q30 q30 2.4849066 1.3862944 1.3862944
q31 q31 3.9120230 2.5649494 3.2188758
q32 q32 3.1780538 3.8712010 3.6888795
q33 q33 1.7047481 0.9162907 1.3862944
q34 q34 3.9120230 2.3025851 3.9120230

2b. (Plus one to all values): Convert median responses to a natural log scale (NO 3SD Removal)

# Define Log function: Add 1
log_plus_one <- function(x) {
  log(x + 1)
}

# Apply the log function to NO 3SD medians
log_medians_average_no3sd_plusone <- log_plus_one(column_medians_average_no3sd)
log_medians_ideal_no3sd_plusone <- log_plus_one(column_medians_ideal_no3sd)
log_medians_reasonable_no3sd_plusone <- log_plus_one(column_medians_reasonable_no3sd)

# Create a dataframe for the log medians (NO 3SD, Plus One)
log_medians_df_no3sd_plusone <- data.frame(
  Question = names(log_medians_average_no3sd_plusone),
  Average = log_medians_average_no3sd_plusone,
  Ideal = log_medians_ideal_no3sd_plusone,
  Reasonable = log_medians_reasonable_no3sd_plusone
)

# Display the log medians
kable(log_medians_df_no3sd_plusone, caption = "Log-Transformed (Plus One) Median Values (NO 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Log-Transformed (Plus One) Median Values (NO 3SD Removal)
Question Average Ideal Reasonable
q1 q1 1.386294 1.0986123 1.0986123
q2 q2 2.079442 1.0986123 1.6094379
q3 q3 1.386294 1.9459101 1.7917595
q4 q4 7.601402 7.6014023 7.6014023
q5 q5 3.044522 3.4339872 3.4339872
q6 q6 2.397895 0.6931472 1.3862944
q7 q7 2.772589 1.7917595 2.3978953
q8 q8 1.791759 2.5649494 2.3978953
q9 q9 1.791759 1.3862944 1.7917595
q10 q10 1.791759 0.0000000 1.0986123
q11 q11 6.216606 0.0000000 2.3978953
q12 q12 3.433987 0.6931472 2.3978953
q13 q13 3.713572 2.3978953 2.7725887
q14 q14 2.397895 1.0986123 1.7917595
q15 q15 2.302585 2.3978953 2.1972246
q16 q16 1.791759 1.7917595 1.6094379
q17 q17 1.386294 0.0000000 0.6931472
q18 q18 2.772589 1.0986123 2.3513753
q20 q20 3.044522 0.0000000 1.6094379
q21 q21 2.197225 1.7917595 1.7917595
q22 q22 2.397895 2.0794415 2.0794415
q23 q23 1.098612 1.3862944 1.3862944
q24 q24 2.833213 3.2188758 3.2188758
q25 q25 7.601402 6.2166061 6.9087548
q26 q26 2.197225 1.0986123 1.6094379
q27 q27 2.564949 1.7917595 1.9459101
q28 q28 2.397895 3.0445224 3.0445224
q29 q29 4.025352 4.5108595 4.3944492
q30 q30 2.564949 1.6094379 1.6094379
q31 q31 3.931826 2.6390573 3.2580965
q32 q32 3.218876 3.8918203 3.7135721
q33 q33 1.871802 1.2527630 1.6094379
q34 q34 3.931826 2.3978953 3.9318256

3a. (Small constant inf values): Conduct three regressions and record AIC (NO 3SD Removal)

# Create data frame for regression (NO 3SD, Small Constant)
dataforAICMedians_no3sd_smallval <- data.frame(
  Reasonable = log_medians_reasonable_no3sd_smallval,
  Average = log_medians_average_no3sd_smallval,
  Ideal = log_medians_ideal_no3sd_smallval
)

# Filter out rows with non-finite values before regression
dataforAICMedians_no3sd_smallval_clean <- dataforAICMedians_no3sd_smallval %>%
  filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

# Check if enough data points remain
if(nrow(dataforAICMedians_no3sd_smallval_clean) > 3) {

  # Model I
  Model1Medians_no3sd_smallval <- lm(Reasonable ~ Average, data = dataforAICMedians_no3sd_smallval_clean)
  AIC1Medians_no3sd_smallval <- AIC(Model1Medians_no3sd_smallval)
  r2_model1_median_no3sd_smallval <- summary(Model1Medians_no3sd_smallval)$r.squared

  # Model II
  Model2Medians_no3sd_smallval <- lm(Reasonable ~ Ideal, data = dataforAICMedians_no3sd_smallval_clean)
  AIC2Medians_no3sd_smallval <- AIC(Model2Medians_no3sd_smallval)
  r2_model2_median_no3sd_smallval <- summary(Model2Medians_no3sd_smallval)$r.squared

  # Model III
  Model3Medians_no3sd_smallval <- lm(Reasonable ~ Average + Ideal, data = dataforAICMedians_no3sd_smallval_clean)
  AIC3Medians_no3sd_smallval <- AIC(Model3Medians_no3sd_smallval)
  r2_model3_median_no3sd_smallval <- summary(Model3Medians_no3sd_smallval)$r.squared

  # Create a regression summary dataframe
  regression_summary_medians_no3sd_smallval <- data.frame(
    Model = c("Average predicts Reasonable",
              "Ideal predicts Reasonable",
              "Average + Ideal predict Reasonable"),
    AIC = c(AIC1Medians_no3sd_smallval, AIC2Medians_no3sd_smallval, AIC3Medians_no3sd_smallval),
    R_squared = c(r2_model1_median_no3sd_smallval, r2_model2_median_no3sd_smallval, r2_model3_median_no3sd_smallval)
  )

  # Display the regression summary table
  kable(regression_summary_medians_no3sd_smallval, caption = "Regression Comparison (Median, NO 3SD, Log Small Constant)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

} else {
  cat("Not enough valid data points for regression (NO 3SD, Log Small Constant).\n")
  regression_summary_medians_no3sd_smallval <- data.frame(Model=character(), AIC=numeric(), R_squared=numeric())
}
Regression Comparison (Median, NO 3SD, Log Small Constant)
Model AIC R_squared
Average predicts Reasonable 85.96537 0.7350584
Ideal predicts Reasonable 102.37505 0.5643791
Average + Ideal predict Reasonable 39.38395 0.9392149

3b. (Plus one to all values): Conduct three regressions and record AIC (NO 3SD Removal)

# Create data frame for regression (NO 3SD, Plus One)
dataforAICMedians_no3sd_plusone <- data.frame(
  Reasonable = log_medians_reasonable_no3sd_plusone,
  Average = log_medians_average_no3sd_plusone,
  Ideal = log_medians_ideal_no3sd_plusone
)

# Filter out rows with non-finite values before regression
dataforAICMedians_no3sd_plusone_clean <- dataforAICMedians_no3sd_plusone %>%
  filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

# Check if enough data points remain
if(nrow(dataforAICMedians_no3sd_plusone_clean) > 3) {

  # Model I
  Model1Medians_no3sd_plusone <- lm(Reasonable ~ Average, data = dataforAICMedians_no3sd_plusone_clean)
  AIC1Medians_no3sd_plusone <- AIC(Model1Medians_no3sd_plusone)
  r2_model1_median_no3sd_plusone <- summary(Model1Medians_no3sd_plusone)$r.squared

  # Model II
  Model2Medians_no3sd_plusone <- lm(Reasonable ~ Ideal, data = dataforAICMedians_no3sd_plusone_clean)
  AIC2Medians_no3sd_plusone <- AIC(Model2Medians_no3sd_plusone)
  r2_model2_median_no3sd_plusone <- summary(Model2Medians_no3sd_plusone)$r.squared

  # Model III
  Model3Medians_no3sd_plusone <- lm(Reasonable ~ Average + Ideal, data = dataforAICMedians_no3sd_plusone_clean)
  AIC3Medians_no3sd_plusone <- AIC(Model3Medians_no3sd_plusone)
  r2_model3_median_no3sd_plusone <- summary(Model3Medians_no3sd_plusone)$r.squared

  # Create a regression summary dataframe
  regression_summary_medians_no3sd_plusone <- data.frame(
    Model = c("Average predicts Reasonable",
              "Ideal predicts Reasonable",
              "Average + Ideal predict Reasonable"),
    AIC = c(AIC1Medians_no3sd_plusone, AIC2Medians_no3sd_plusone, AIC3Medians_no3sd_plusone),
    R_squared = c(r2_model1_median_no3sd_plusone, r2_model2_median_no3sd_plusone, r2_model3_median_no3sd_plusone)
  )

  # Display the regression summary table
  kable(regression_summary_medians_no3sd_plusone, caption = "Regression Comparison (Median, NO 3SD, Log Plus One)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

} else {
   cat("Not enough valid data points for regression (NO 3SD, Log Plus One).\n")
   regression_summary_medians_no3sd_plusone <- data.frame(Model=character(), AIC=numeric(), R_squared=numeric())
}
Regression Comparison (Median, NO 3SD, Log Plus One)
Model AIC R_squared
Average predicts Reasonable 80.731562 0.7415783
Ideal predicts Reasonable 62.561052 0.8509963
Average + Ideal predict Reasonable 9.982732 0.9714949

4a. (Small constant inf values): Plots and Reporting (NO 3SD Removal)

# Check if models exist before plotting
if(exists("Model1Medians_no3sd_smallval") && exists("dataforAICMedians_no3sd_smallval_clean") && nrow(dataforAICMedians_no3sd_smallval_clean) > 0) {

  # Average Median vs Reasonable Median
  plot1_median_no3sd_smallval <- ggplot(dataforAICMedians_no3sd_smallval_clean, aes(x = Average, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "blue") +
    labs(title = "Model 1 (NO 3SD, Log Small Const): Median Average vs Reasonable",
         subtitle = paste("AIC =", round(AIC1Medians_no3sd_smallval, 2), ", R² =", round(r2_model1_median_no3sd_smallval, 3)),
         x = "Log (Small Const) Median Average Rating", y = "Log (Small Const) Median Reasonable Rating") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Ideal Median vs Reasonable Median
  plot2_median_no3sd_smallval <- ggplot(dataforAICMedians_no3sd_smallval_clean, aes(x = Ideal, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = "Model 2 (NO 3SD, Log Small Const): Median Ideal vs Reasonable",
         subtitle = paste("AIC =", round(AIC2Medians_no3sd_smallval, 2), ", R² =", round(r2_model2_median_no3sd_smallval, 3)),
         x = "Log (Small Const) Median Ideal Rating", y = "Log (Small Const) Median Reasonable Rating") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Average effect)
  new_data_median_no3sd_smallval <- expand.grid( Average = seq(min(dataforAICMedians_no3sd_smallval_clean$Average, na.rm=TRUE), max(dataforAICMedians_no3sd_smallval_clean$Average, na.rm=TRUE), length.out = 100), Ideal = median(dataforAICMedians_no3sd_smallval_clean$Ideal, na.rm=TRUE) )
  new_data_median_no3sd_smallval <- new_data_median_no3sd_smallval[is.finite(new_data_median_no3sd_smallval$Average) & is.finite(new_data_median_no3sd_smallval$Ideal),]
  if(nrow(new_data_median_no3sd_smallval)>0){ new_data_median_no3sd_smallval$predicted_reasonable <- predict(Model3Medians_no3sd_smallval, newdata = new_data_median_no3sd_smallval) } else { new_data_median_no3sd_smallval <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric())}
  plot3_median_no3sd_smallval <- ggplot(dataforAICMedians_no3sd_smallval_clean, aes(x = Average, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data_median_no3sd_smallval, aes(y = predicted_reasonable), color = "blue", size = 1) + labs(title = "Model 3 (NO 3SD, Log Small Const): Effect of Median Average", subtitle = paste("AIC =", round(AIC3Medians_no3sd_smallval, 2), ", R² =", round(r2_model3_median_no3sd_smallval, 3)), x = "Log (Small Const) Median Average Rating", y = "Log (Small Const) Median Reasonable Rating") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Ideal effect)
  new_data2_median_no3sd_smallval <- expand.grid( Average = median(dataforAICMedians_no3sd_smallval_clean$Average, na.rm=TRUE), Ideal = seq(min(dataforAICMedians_no3sd_smallval_clean$Ideal, na.rm=TRUE), max(dataforAICMedians_no3sd_smallval_clean$Ideal, na.rm=TRUE), length.out = 100) )
  new_data2_median_no3sd_smallval <- new_data2_median_no3sd_smallval[is.finite(new_data2_median_no3sd_smallval$Average) & is.finite(new_data2_median_no3sd_smallval$Ideal),]
  if(nrow(new_data2_median_no3sd_smallval)>0){ new_data2_median_no3sd_smallval$predicted_reasonable <- predict(Model3Medians_no3sd_smallval, newdata = new_data2_median_no3sd_smallval) } else { new_data2_median_no3sd_smallval <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric())}
  plot4_median_no3sd_smallval <- ggplot(dataforAICMedians_no3sd_smallval_clean, aes(x = Ideal, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data2_median_no3sd_smallval, aes(y = predicted_reasonable), color = "red", size = 1) + labs(title = "Model 3 (NO 3SD, Log Small Const): Effect of Median Ideal", subtitle = paste("AIC =", round(AIC3Medians_no3sd_smallval, 2), ", R² =", round(r2_model3_median_no3sd_smallval, 3)), x = "Log (Small Const) Median Ideal Rating", y = "Log (Small Const) Median Reasonable Rating") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Display plots
  print(plot1_median_no3sd_smallval)
  print(plot2_median_no3sd_smallval)
  print(plot3_median_no3sd_smallval)
  print(plot4_median_no3sd_smallval)

  # Save plots
  ggsave("model1_median_avg_reas_no3sd_smallval.png", plot1_median_no3sd_smallval, width = 8, height = 6, dpi = 300)
  ggsave("model2_median_ideal_reas_no3sd_smallval.png", plot2_median_no3sd_smallval, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_avg_eff_no3sd_smallval.png", plot3_median_no3sd_smallval, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_ideal_eff_no3sd_smallval.png", plot4_median_no3sd_smallval, width = 8, height = 6, dpi = 300)
  grid_plots_median_no3sd_smallval <- grid.arrange(plot1_median_no3sd_smallval, plot2_median_no3sd_smallval, plot3_median_no3sd_smallval, plot4_median_no3sd_smallval, ncol = 2)
  ggsave("all_median_plots_no3sd_smallval.png", grid_plots_median_no3sd_smallval, width = 14, height = 10, dpi = 300)

  # Correlation matrix
  cor_data_no3sd_smallval <- dataforAICMedians_no3sd_smallval_clean[complete.cases(dataforAICMedians_no3sd_smallval_clean),]
  if(nrow(cor_data_no3sd_smallval) > 1) {
   cor_matrix_no3sd_smallval <- cor(cor_data_no3sd_smallval, use = "complete.obs")
   corrplot(cor_matrix_no3sd_smallval, method = "color", type = "upper", addCoef.col = "black", tl.col = "black", tl.srt = 45, diag = FALSE, title = "Correlation Matrix (NO 3SD, Log Small Constant)", mar = c(0,0,1,0))
  } else { cat("Not enough data for correlation matrix (NO 3SD, Log Small Constant).\n") }

  # Coefficient tables
  model1_coefs_no3sd_smallval <- get_model_coefs(Model1Medians_no3sd_smallval)
  model2_coefs_no3sd_smallval <- get_model_coefs(Model2Medians_no3sd_smallval)
  model3_coefs_no3sd_smallval <- get_model_coefs(Model3Medians_no3sd_smallval)
  model1_coefs_no3sd_smallval$p_value <- sapply(model1_coefs_no3sd_smallval$p_value, format_pvalue)
  model2_coefs_no3sd_smallval$p_value <- sapply(model2_coefs_no3sd_smallval$p_value, format_pvalue)
  model3_coefs_no3sd_smallval$p_value <- sapply(model3_coefs_no3sd_smallval$p_value, format_pvalue)
  kable(model1_coefs_no3sd_smallval, caption = "Model 1 Coeffs (NO 3SD, Log Small Const)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model2_coefs_no3sd_smallval, caption = "Model 2 Coeffs (NO 3SD, Log Small Const)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model3_coefs_no3sd_smallval, caption = "Model 3 Coeffs (NO 3SD, Log Small Const)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

  # Print model summaries
  cat("\nModel Summaries (NO 3SD, Log Small Constant):\n")
  print(summary(Model1Medians_no3sd_smallval))
  print(summary(Model2Medians_no3sd_smallval))
  print(summary(Model3Medians_no3sd_smallval))

} else {
   cat("Skipping plots and summaries for (NO 3SD, Log Small Constant) due to insufficient data.\n")
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## 
## Model Summaries (NO 3SD, Log Small Constant):
## 
## Call:
## lm(formula = Reasonable ~ Average, data = dataforAICMedians_no3sd_smallval_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.90856 -0.36496 -0.02592  0.62745  1.22985 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.01139    0.29154   0.039    0.969    
## Average      0.83670    0.09022   9.274 1.88e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8385 on 31 degrees of freedom
## Multiple R-squared:  0.7351, Adjusted R-squared:  0.7265 
## F-statistic: 86.01 on 1 and 31 DF,  p-value: 1.878e-10
## 
## 
## Call:
## lm(formula = Reasonable ~ Ideal, data = dataforAICMedians_no3sd_smallval_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3866 -0.6934 -0.1773  0.3218  2.5365 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.7771     0.2080   8.544 1.19e-09 ***
## Ideal         0.4367     0.0689   6.337 4.72e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.075 on 31 degrees of freedom
## Multiple R-squared:  0.5644, Adjusted R-squared:  0.5503 
## F-statistic: 40.16 on 1 and 31 DF,  p-value: 4.715e-07
## 
## 
## Call:
## lm(formula = Reasonable ~ Average + Ideal, data = dataforAICMedians_no3sd_smallval_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78273 -0.27802 -0.06571  0.33604  0.80237 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.16010    0.14273   1.122    0.271    
## Average      0.64924    0.04773  13.601 2.29e-14 ***
## Ideal        0.28538    0.02843  10.038 4.19e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4083 on 30 degrees of freedom
## Multiple R-squared:  0.9392, Adjusted R-squared:  0.9352 
## F-statistic: 231.8 on 2 and 30 DF,  p-value: < 2.2e-16
# Store results
analysis_3_results_no3sd_smallval <- list(
  medians_by_question = if(exists("medians_df_no3sd")) medians_df_no3sd else NULL,
  log_medians_by_question = if(exists("log_medians_df_no3sd_smallval")) log_medians_df_no3sd_smallval else NULL,
  regression_models = if(exists("Model1Medians_no3sd_smallval")) list(model1 = Model1Medians_no3sd_smallval, model2 = Model2Medians_no3sd_smallval, model3 = Model3Medians_no3sd_smallval) else NULL,
  regression_summary = if(exists("regression_summary_medians_no3sd_smallval")) regression_summary_medians_no3sd_smallval else NULL,
  aic_values = if(exists("AIC1Medians_no3sd_smallval")) c(AIC1 = AIC1Medians_no3sd_smallval, AIC2 = AIC2Medians_no3sd_smallval, AIC3 = AIC3Medians_no3sd_smallval) else NULL
)

4b. (Plus one to all values): Plots and Reporting (NO 3SD Removal)

# Check if models exist before plotting
if(exists("Model1Medians_no3sd_plusone") && exists("dataforAICMedians_no3sd_plusone_clean") && nrow(dataforAICMedians_no3sd_plusone_clean) > 0) {

  # Average Median vs Reasonable Median
  plot1_median_no3sd_plusone <- ggplot(dataforAICMedians_no3sd_plusone_clean, aes(x = Average, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "blue") +
    labs(title = "Model 1 (NO 3SD, Log+1): Median Average vs Reasonable",
         subtitle = paste("AIC =", round(AIC1Medians_no3sd_plusone, 2), ", R² =", round(r2_model1_median_no3sd_plusone, 3)),
         x = "Log (+1) Median Average Rating", y = "Log (+1) Median Reasonable Rating") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Ideal Median vs Reasonable Median
  plot2_median_no3sd_plusone <- ggplot(dataforAICMedians_no3sd_plusone_clean, aes(x = Ideal, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = "Model 2 (NO 3SD, Log+1): Median Ideal vs Reasonable",
         subtitle = paste("AIC =", round(AIC2Medians_no3sd_plusone, 2), ", R² =", round(r2_model2_median_no3sd_plusone, 3)),
         x = "Log (+1) Median Ideal Rating", y = "Log (+1) Median Reasonable Rating") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Average effect)
  new_data_median_no3sd_plusone <- expand.grid( Average = seq(min(dataforAICMedians_no3sd_plusone_clean$Average, na.rm=TRUE), max(dataforAICMedians_no3sd_plusone_clean$Average, na.rm=TRUE), length.out = 100), Ideal = median(dataforAICMedians_no3sd_plusone_clean$Ideal, na.rm=TRUE) )
  new_data_median_no3sd_plusone <- new_data_median_no3sd_plusone[is.finite(new_data_median_no3sd_plusone$Average) & is.finite(new_data_median_no3sd_plusone$Ideal),]
  if(nrow(new_data_median_no3sd_plusone)>0){ new_data_median_no3sd_plusone$predicted_reasonable <- predict(Model3Medians_no3sd_plusone, newdata = new_data_median_no3sd_plusone) } else { new_data_median_no3sd_plusone <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric())}
  plot3_median_no3sd_plusone <- ggplot(dataforAICMedians_no3sd_plusone_clean, aes(x = Average, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data_median_no3sd_plusone, aes(y = predicted_reasonable), color = "blue", size = 1) + labs(title = "Model 3 (NO 3SD, Log+1): Effect of Median Average", subtitle = paste("AIC =", round(AIC3Medians_no3sd_plusone, 2), ", R² =", round(r2_model3_median_no3sd_plusone, 3)), x = "Log (+1) Median Average Rating", y = "Log (+1) Median Reasonable Rating") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Ideal effect)
  new_data2_median_no3sd_plusone <- expand.grid( Average = median(dataforAICMedians_no3sd_plusone_clean$Average, na.rm=TRUE), Ideal = seq(min(dataforAICMedians_no3sd_plusone_clean$Ideal, na.rm=TRUE), max(dataforAICMedians_no3sd_plusone_clean$Ideal, na.rm=TRUE), length.out = 100) )
  new_data2_median_no3sd_plusone <- new_data2_median_no3sd_plusone[is.finite(new_data2_median_no3sd_plusone$Average) & is.finite(new_data2_median_no3sd_plusone$Ideal),]
  if(nrow(new_data2_median_no3sd_plusone)>0){ new_data2_median_no3sd_plusone$predicted_reasonable <- predict(Model3Medians_no3sd_plusone, newdata = new_data2_median_no3sd_plusone) } else { new_data2_median_no3sd_plusone <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric())}
  plot4_median_no3sd_plusone <- ggplot(dataforAICMedians_no3sd_plusone_clean, aes(x = Ideal, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data2_median_no3sd_plusone, aes(y = predicted_reasonable), color = "red", size = 1) + labs(title = "Model 3 (NO 3SD, Log+1): Effect of Median Ideal", subtitle = paste("AIC =", round(AIC3Medians_no3sd_plusone, 2), ", R² =", round(r2_model3_median_no3sd_plusone, 3)), x = "Log (+1) Median Ideal Rating", y = "Log (+1) Median Reasonable Rating") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Display plots
  print(plot1_median_no3sd_plusone)
  print(plot2_median_no3sd_plusone)
  print(plot3_median_no3sd_plusone)
  print(plot4_median_no3sd_plusone)

  # Save plots
  ggsave("model1_median_avg_reas_no3sd_plusone.png", plot1_median_no3sd_plusone, width = 8, height = 6, dpi = 300)
  ggsave("model2_median_ideal_reas_no3sd_plusone.png", plot2_median_no3sd_plusone, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_avg_eff_no3sd_plusone.png", plot3_median_no3sd_plusone, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_ideal_eff_no3sd_plusone.png", plot4_median_no3sd_plusone, width = 8, height = 6, dpi = 300)
  grid_plots_median_no3sd_plusone <- grid.arrange(plot1_median_no3sd_plusone, plot2_median_no3sd_plusone, plot3_median_no3sd_plusone, plot4_median_no3sd_plusone, ncol = 2)
  ggsave("all_median_plots_no3sd_plusone.png", grid_plots_median_no3sd_plusone, width = 14, height = 10, dpi = 300)

  # Correlation matrix
  cor_data_no3sd_plusone <- dataforAICMedians_no3sd_plusone_clean[complete.cases(dataforAICMedians_no3sd_plusone_clean),]
  if(nrow(cor_data_no3sd_plusone) > 1) {
   cor_matrix_no3sd_plusone <- cor(cor_data_no3sd_plusone, use = "complete.obs")
   corrplot(cor_matrix_no3sd_plusone, method = "color", type = "upper", addCoef.col = "black", tl.col = "black", tl.srt = 45, diag = FALSE, title = "Correlation Matrix (NO 3SD, Log Plus One)", mar = c(0,0,1,0))
  } else { cat("Not enough data for correlation matrix (NO 3SD, Log Plus One).\n") }

  # Coefficient tables
  model1_coefs_no3sd_plusone <- get_model_coefs(Model1Medians_no3sd_plusone)
  model2_coefs_no3sd_plusone <- get_model_coefs(Model2Medians_no3sd_plusone)
  model3_coefs_no3sd_plusone <- get_model_coefs(Model3Medians_no3sd_plusone)
  model1_coefs_no3sd_plusone$p_value <- sapply(model1_coefs_no3sd_plusone$p_value, format_pvalue)
  model2_coefs_no3sd_plusone$p_value <- sapply(model2_coefs_no3sd_plusone$p_value, format_pvalue)
  model3_coefs_no3sd_plusone$p_value <- sapply(model3_coefs_no3sd_plusone$p_value, format_pvalue)
  kable(model1_coefs_no3sd_plusone, caption = "Model 1 Coeffs (NO 3SD, Log Plus One)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model2_coefs_no3sd_plusone, caption = "Model 2 Coeffs (NO 3SD, Log Plus One)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model3_coefs_no3sd_plusone, caption = "Model 3 Coeffs (NO 3SD, Log Plus One)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

  # Print model summaries
  cat("\nModel Summaries (NO 3SD, Log Plus One):\n")
  print(summary(Model1Medians_no3sd_plusone))
  print(summary(Model2Medians_no3sd_plusone))
  print(summary(Model3Medians_no3sd_plusone))

} else {
  cat("Skipping plots and summaries for (NO 3SD, Log Plus One) due to insufficient data.\n")
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## 
## Model Summaries (NO 3SD, Log Plus One):
## 
## Call:
## lm(formula = Reasonable ~ Average, data = dataforAICMedians_no3sd_plusone_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.83433 -0.32028 -0.05108  0.53872  1.23137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.12437    0.28692   0.433    0.668    
## Average      0.82165    0.08711   9.432 1.27e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7746 on 31 degrees of freedom
## Multiple R-squared:  0.7416, Adjusted R-squared:  0.7332 
## F-statistic: 88.96 on 1 and 31 DF,  p-value: 1.271e-10
## 
## 
## Call:
## lm(formula = Reasonable ~ Ideal, data = dataforAICMedians_no3sd_plusone_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6811 -0.4486 -0.1409  0.2681  1.5866 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.81133    0.16383   4.952 2.46e-05 ***
## Ideal        0.82556    0.06204  13.306 2.34e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5882 on 31 degrees of freedom
## Multiple R-squared:  0.851,  Adjusted R-squared:  0.8462 
## F-statistic:   177 on 1 and 31 DF,  p-value: 2.345e-14
## 
## 
## Call:
## lm(formula = Reasonable ~ Average + Ideal, data = dataforAICMedians_no3sd_plusone_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50094 -0.21285  0.00378  0.08933  0.78470 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.09186    0.09689   0.948    0.351    
## Average      0.43408    0.03855  11.261 2.68e-12 ***
## Ideal        0.56239    0.03615  15.556 6.64e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2615 on 30 degrees of freedom
## Multiple R-squared:  0.9715, Adjusted R-squared:  0.9696 
## F-statistic: 511.2 on 2 and 30 DF,  p-value: < 2.2e-16
# Store results
analysis_3_results_no3sd_plusone <- list(
  medians_by_question = if(exists("medians_df_no3sd")) medians_df_no3sd else NULL,
  log_medians_by_question = if(exists("log_medians_df_no3sd_plusone")) log_medians_df_no3sd_plusone else NULL,
  regression_models = if(exists("Model1Medians_no3sd_plusone")) list(model1 = Model1Medians_no3sd_plusone, model2 = Model2Medians_no3sd_plusone, model3 = Model3Medians_no3sd_plusone) else NULL,
  regression_summary = if(exists("regression_summary_medians_no3sd_plusone")) regression_summary_medians_no3sd_plusone else NULL,
  aic_values = if(exists("AIC1Medians_no3sd_plusone")) c(AIC1 = AIC1Medians_no3sd_plusone, AIC2 = AIC2Medians_no3sd_plusone, AIC3 = AIC3Medians_no3sd_plusone) else NULL
)

Analysis 3 [WITH 3 SD removed]: Overall results, with median ratings

1. Compute the median rating

# Calculate medians from data filtered for attention check AND 3SD outliers
column_medians_average_3sd <- sapply(Data_All_Average_Filtered[, question_cols], median, na.rm = TRUE)
column_medians_ideal_3sd <- sapply(Data_All_Ideal_Filtered[, question_cols], median, na.rm = TRUE)
column_medians_reasonable_3sd <- sapply(Data_All_Reasonable_Filtered[, question_cols], median, na.rm = TRUE)

# Create a dataframe for the medians (WITH 3SD)
medians_df_3sd <- data.frame(
  Question = names(column_medians_average_3sd),
  Average = column_medians_average_3sd,
  Ideal = column_medians_ideal_3sd,
  Reasonable = column_medians_reasonable_3sd
)

# Display the medians for each question
kable(medians_df_3sd, caption = "Median Values (WITH 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Median Values (WITH 3SD Removal)
Question Average Ideal Reasonable
q1 q1 3.0 2 2
q2 q2 7.0 2 3
q3 q3 3.0 6 5
q4 q4 2000.0 2000 2000
q5 q5 20.0 30 30
q6 q6 10.0 1 3
q7 q7 15.0 5 10
q8 q8 5.0 12 10
q9 q9 5.0 3 5
q10 q10 5.0 0 2
q11 q11 500.0 0 7
q12 q12 30.0 0 10
q13 q13 40.0 10 15
q14 q14 10.0 2 5
q15 q15 8.0 10 8
q16 q16 4.0 5 4
q17 q17 3.0 0 1
q18 q18 10.0 1 5
q20 q20 20.0 0 2
q21 q21 8.0 5 5
q22 q22 7.0 7 7
q23 q23 2.0 2 3
q24 q24 15.0 24 24
q25 q25 2000.0 500 1000
q26 q26 8.0 2 4
q27 q27 11.5 4 6
q28 q28 10.0 20 20
q29 q29 54.5 90 80
q30 q30 12.0 3 4
q31 q31 50.0 10 20
q32 q32 24.0 48 40
q33 q33 5.0 2 3
q34 q34 50.0 5 50

2a. (Small constant inf values): Convert median responses to a natural log scale (WITH 3SD Removal)

# Define Log function: Small constant for zero
log_smallvalue <- function(x) {
  x_safe <- x
  # Replace only actual zeros, leave NAs as NA
  x_safe[which(x == 0 & !is.na(x))] <- 0.01
  log(x_safe)
}

# Apply the log function: log_smallvalue (defined earlier)
log_medians_average_3sd_smallval <- log_smallvalue(column_medians_average_3sd)
log_medians_ideal_3sd_smallval <- log_smallvalue(column_medians_ideal_3sd)
log_medians_reasonable_3sd_smallval <- log_smallvalue(column_medians_reasonable_3sd)

# Create a dataframe for the log medians (WITH 3SD, Small Constant)
log_medians_df_3sd_smallval <- data.frame(
  Question = names(log_medians_average_3sd_smallval),
  Average = log_medians_average_3sd_smallval,
  Ideal = log_medians_ideal_3sd_smallval,
  Reasonable = log_medians_reasonable_3sd_smallval
)

# Display the log medians
kable(log_medians_df_3sd_smallval, caption = "Log-Transformed (Small Constant) Median Values (WITH 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Log-Transformed (Small Constant) Median Values (WITH 3SD Removal)
Question Average Ideal Reasonable
q1 q1 1.0986123 0.6931472 0.6931472
q2 q2 1.9459101 0.6931472 1.0986123
q3 q3 1.0986123 1.7917595 1.6094379
q4 q4 7.6009025 7.6009025 7.6009025
q5 q5 2.9957323 3.4011974 3.4011974
q6 q6 2.3025851 0.0000000 1.0986123
q7 q7 2.7080502 1.6094379 2.3025851
q8 q8 1.6094379 2.4849066 2.3025851
q9 q9 1.6094379 1.0986123 1.6094379
q10 q10 1.6094379 -4.6051702 0.6931472
q11 q11 6.2146081 -4.6051702 1.9459101
q12 q12 3.4011974 -4.6051702 2.3025851
q13 q13 3.6888795 2.3025851 2.7080502
q14 q14 2.3025851 0.6931472 1.6094379
q15 q15 2.0794415 2.3025851 2.0794415
q16 q16 1.3862944 1.6094379 1.3862944
q17 q17 1.0986123 -4.6051702 0.0000000
q18 q18 2.3025851 0.0000000 1.6094379
q20 q20 2.9957323 -4.6051702 0.6931472
q21 q21 2.0794415 1.6094379 1.6094379
q22 q22 1.9459101 1.9459101 1.9459101
q23 q23 0.6931472 0.6931472 1.0986123
q24 q24 2.7080502 3.1780538 3.1780538
q25 q25 7.6009025 6.2146081 6.9077553
q26 q26 2.0794415 0.6931472 1.3862944
q27 q27 2.4423470 1.3862944 1.7917595
q28 q28 2.3025851 2.9957323 2.9957323
q29 q29 3.9982007 4.4998097 4.3820266
q30 q30 2.4849066 1.0986123 1.3862944
q31 q31 3.9120230 2.3025851 2.9957323
q32 q32 3.1780538 3.8712010 3.6888795
q33 q33 1.6094379 0.6931472 1.0986123
q34 q34 3.9120230 1.6094379 3.9120230

2b. (Plus one to all values): Convert median responses to a natural log scale (WITH 3SD Removal)

# Define Log function: Add 1
log_plus_one <- function(x) {
  log(x + 1)
}

# Apply the log function: log_plus_one (defined earlier)
log_medians_average_3sd_plusone <- log_plus_one(column_medians_average_3sd)
log_medians_ideal_3sd_plusone <- log_plus_one(column_medians_ideal_3sd)
log_medians_reasonable_3sd_plusone <- log_plus_one(column_medians_reasonable_3sd)

# Create a dataframe for the log medians (WITH 3SD, Plus One)
log_medians_df_3sd_plusone <- data.frame(
  Question = names(log_medians_average_3sd_plusone),
  Average = log_medians_average_3sd_plusone,
  Ideal = log_medians_ideal_3sd_plusone,
  Reasonable = log_medians_reasonable_3sd_plusone
)

# Display the log medians
kable(log_medians_df_3sd_plusone, caption = "Log-Transformed (Plus One) Median Values (WITH 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Log-Transformed (Plus One) Median Values (WITH 3SD Removal)
Question Average Ideal Reasonable
q1 q1 1.386294 1.0986123 1.0986123
q2 q2 2.079442 1.0986123 1.3862944
q3 q3 1.386294 1.9459101 1.7917595
q4 q4 7.601402 7.6014023 7.6014023
q5 q5 3.044522 3.4339872 3.4339872
q6 q6 2.397895 0.6931472 1.3862944
q7 q7 2.772589 1.7917595 2.3978953
q8 q8 1.791759 2.5649494 2.3978953
q9 q9 1.791759 1.3862944 1.7917595
q10 q10 1.791759 0.0000000 1.0986123
q11 q11 6.216606 0.0000000 2.0794415
q12 q12 3.433987 0.0000000 2.3978953
q13 q13 3.713572 2.3978953 2.7725887
q14 q14 2.397895 1.0986123 1.7917595
q15 q15 2.197225 2.3978953 2.1972246
q16 q16 1.609438 1.7917595 1.6094379
q17 q17 1.386294 0.0000000 0.6931472
q18 q18 2.397895 0.6931472 1.7917595
q20 q20 3.044522 0.0000000 1.0986123
q21 q21 2.197225 1.7917595 1.7917595
q22 q22 2.079442 2.0794415 2.0794415
q23 q23 1.098612 1.0986123 1.3862944
q24 q24 2.772589 3.2188758 3.2188758
q25 q25 7.601402 6.2166061 6.9087548
q26 q26 2.197225 1.0986123 1.6094379
q27 q27 2.525729 1.6094379 1.9459101
q28 q28 2.397895 3.0445224 3.0445224
q29 q29 4.016383 4.5108595 4.3944492
q30 q30 2.564949 1.3862944 1.6094379
q31 q31 3.931826 2.3978953 3.0445224
q32 q32 3.218876 3.8918203 3.7135721
q33 q33 1.791759 1.0986123 1.3862944
q34 q34 3.931826 1.7917595 3.9318256

3a. (Small constant inf values): Conduct three regressions and record AIC (WITH 3SD Removal)

# Create data frame for regression (WITH 3SD, Small Constant)
dataforAICMedians_3sd_smallval <- data.frame(
  Reasonable = log_medians_reasonable_3sd_smallval,
  Average = log_medians_average_3sd_smallval,
  Ideal = log_medians_ideal_3sd_smallval
)

# Filter out rows with non-finite values before regression
dataforAICMedians_3sd_smallval_clean <- dataforAICMedians_3sd_smallval %>%
  filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

# Check if enough data points remain
if(nrow(dataforAICMedians_3sd_smallval_clean) > 3) {

  # Model I
  Model1Medians_3sd_smallval <- lm(Reasonable ~ Average, data = dataforAICMedians_3sd_smallval_clean)
  AIC1Medians_3sd_smallval <- AIC(Model1Medians_3sd_smallval)
  r2_model1_median_3sd_smallval <- summary(Model1Medians_3sd_smallval)$r.squared

  # Model II
  Model2Medians_3sd_smallval <- lm(Reasonable ~ Ideal, data = dataforAICMedians_3sd_smallval_clean)
  AIC2Medians_3sd_smallval <- AIC(Model2Medians_3sd_smallval)
  r2_model2_median_3sd_smallval <- summary(Model2Medians_3sd_smallval)$r.squared

  # Model III
  Model3Medians_3sd_smallval <- lm(Reasonable ~ Average + Ideal, data = dataforAICMedians_3sd_smallval_clean)
  AIC3Medians_3sd_smallval <- AIC(Model3Medians_3sd_smallval)
  r2_model3_median_3sd_smallval <- summary(Model3Medians_3sd_smallval)$r.squared

  # Create a regression summary dataframe
  regression_summary_medians_3sd_smallval <- data.frame(
    Model = c("Average predicts Reasonable",
              "Ideal predicts Reasonable",
              "Average + Ideal predict Reasonable"),
    AIC = c(AIC1Medians_3sd_smallval, AIC2Medians_3sd_smallval, AIC3Medians_3sd_smallval),
    R_squared = c(r2_model1_median_3sd_smallval, r2_model2_median_3sd_smallval, r2_model3_median_3sd_smallval)
  )

  # Display the regression summary table
  kable(regression_summary_medians_3sd_smallval, caption = "Regression Comparison (Median, WITH 3SD, Log Small Constant)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

} else {
   cat("Not enough valid data points for regression (WITH 3SD, Log Small Constant).\n")
   regression_summary_medians_3sd_smallval <- data.frame(Model=character(), AIC=numeric(), R_squared=numeric())
}
Regression Comparison (Median, WITH 3SD, Log Small Constant)
Model AIC R_squared
Average predicts Reasonable 92.68637 0.6876407
Ideal predicts Reasonable 105.28113 0.5424832
Average + Ideal predict Reasonable 50.95398 0.9169926

3b. (Plus one to all values): Conduct three regressions and record AIC (WITH 3SD Removal)

# Create data frame for regression (WITH 3SD, Plus One)
dataforAICMedians_3sd_plusone <- data.frame(
  Reasonable = log_medians_reasonable_3sd_plusone,
  Average = log_medians_average_3sd_plusone,
  Ideal = log_medians_ideal_3sd_plusone
)

# Filter out rows with non-finite values before regression
dataforAICMedians_3sd_plusone_clean <- dataforAICMedians_3sd_plusone %>%
  filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

# Check if enough data points remain
if(nrow(dataforAICMedians_3sd_plusone_clean) > 3) {

  # Model I
  Model1Medians_3sd_plusone <- lm(Reasonable ~ Average, data = dataforAICMedians_3sd_plusone_clean)
  AIC1Medians_3sd_plusone <- AIC(Model1Medians_3sd_plusone)
  r2_model1_median_3sd_plusone <- summary(Model1Medians_3sd_plusone)$r.squared

  # Model II
  Model2Medians_3sd_plusone <- lm(Reasonable ~ Ideal, data = dataforAICMedians_3sd_plusone_clean)
  AIC2Medians_3sd_plusone <- AIC(Model2Medians_3sd_plusone)
  r2_model2_median_3sd_plusone <- summary(Model2Medians_3sd_plusone)$r.squared

  # Model III
  Model3Medians_3sd_plusone <- lm(Reasonable ~ Average + Ideal, data = dataforAICMedians_3sd_plusone_clean)
  AIC3Medians_3sd_plusone <- AIC(Model3Medians_3sd_plusone)
  r2_model3_median_3sd_plusone <- summary(Model3Medians_3sd_plusone)$r.squared

  # Create a regression summary dataframe
  regression_summary_medians_3sd_plusone <- data.frame(
    Model = c("Average predicts Reasonable",
              "Ideal predicts Reasonable",
              "Average + Ideal predict Reasonable"),
    AIC = c(AIC1Medians_3sd_plusone, AIC2Medians_3sd_plusone, AIC3Medians_3sd_plusone),
    R_squared = c(r2_model1_median_3sd_plusone, r2_model2_median_3sd_plusone, r2_model3_median_3sd_plusone)
  )

  # Display the regression summary table
  kable(regression_summary_medians_3sd_plusone, caption = "Regression Comparison (Median, WITH 3SD, Log Plus One)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

} else {
  cat("Not enough valid data points for regression (WITH 3SD, Log Plus One).\n")
  regression_summary_medians_3sd_plusone <- data.frame(Model=character(), AIC=numeric(), R_squared=numeric())
}
Regression Comparison (Median, WITH 3SD, Log Plus One)
Model AIC R_squared
Average predicts Reasonable 86.73858 0.6998228
Ideal predicts Reasonable 65.11702 0.8441063
Average + Ideal predict Reasonable 25.78123 0.9554517

4a. (Small constant inf values): Plots and Reporting (WITH 3SD Removal)

# Check if models exist before plotting
if(exists("Model1Medians_3sd_smallval") && exists("dataforAICMedians_3sd_smallval_clean") && nrow(dataforAICMedians_3sd_smallval_clean) > 0) {

  # Average Median vs Reasonable Median
  plot1_median_3sd_smallval <- ggplot(dataforAICMedians_3sd_smallval_clean, aes(x = Average, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "blue") +
    labs(title = "Model 1 (WITH 3SD, Log Small Const): Median Average vs Reasonable",
         subtitle = paste("AIC =", round(AIC1Medians_3sd_smallval, 2), ", R² =", round(r2_model1_median_3sd_smallval, 3)),
         x = "Log (Small Const) Median Average Rating (3SD Removed)",
         y = "Log (Small Const) Median Reasonable Rating (3SD Removed)") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Ideal Median vs Reasonable Median
  plot2_median_3sd_smallval <- ggplot(dataforAICMedians_3sd_smallval_clean, aes(x = Ideal, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = "Model 2 (WITH 3SD, Log Small Const): Median Ideal vs Reasonable",
         subtitle = paste("AIC =", round(AIC2Medians_3sd_smallval, 2), ", R² =", round(r2_model2_median_3sd_smallval, 3)),
         x = "Log (Small Const) Median Ideal Rating (3SD Removed)",
         y = "Log (Small Const) Median Reasonable Rating (3SD Removed)") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Average effect)
  new_data_median_3sd_smallval <- expand.grid( Average = seq(min(dataforAICMedians_3sd_smallval_clean$Average, na.rm=TRUE), max(dataforAICMedians_3sd_smallval_clean$Average, na.rm=TRUE), length.out = 100), Ideal = median(dataforAICMedians_3sd_smallval_clean$Ideal, na.rm=TRUE) )
  new_data_median_3sd_smallval <- new_data_median_3sd_smallval[is.finite(new_data_median_3sd_smallval$Average) & is.finite(new_data_median_3sd_smallval$Ideal),]
  if(nrow(new_data_median_3sd_smallval)>0){ new_data_median_3sd_smallval$predicted_reasonable <- predict(Model3Medians_3sd_smallval, newdata = new_data_median_3sd_smallval) } else { new_data_median_3sd_smallval <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric()) }
  plot3_median_3sd_smallval <- ggplot(dataforAICMedians_3sd_smallval_clean, aes(x = Average, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data_median_3sd_smallval, aes(y = predicted_reasonable), color = "blue", size = 1) + labs(title = "Model 3 (WITH 3SD, Log Small Const): Effect of Median Average", subtitle = paste("AIC =", round(AIC3Medians_3sd_smallval, 2), ", R² =", round(r2_model3_median_3sd_smallval, 3)), x = "Log (Small Const) Median Average Rating (3SD Removed)", y = "Log (Small Const) Median Reasonable Rating (3SD Removed)") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Ideal effect)
  new_data2_median_3sd_smallval <- expand.grid( Average = median(dataforAICMedians_3sd_smallval_clean$Average, na.rm=TRUE), Ideal = seq(min(dataforAICMedians_3sd_smallval_clean$Ideal, na.rm=TRUE), max(dataforAICMedians_3sd_smallval_clean$Ideal, na.rm=TRUE), length.out = 100) )
  new_data2_median_3sd_smallval <- new_data2_median_3sd_smallval[is.finite(new_data2_median_3sd_smallval$Average) & is.finite(new_data2_median_3sd_smallval$Ideal),]
  if(nrow(new_data2_median_3sd_smallval)>0){ new_data2_median_3sd_smallval$predicted_reasonable <- predict(Model3Medians_3sd_smallval, newdata = new_data2_median_3sd_smallval) } else { new_data2_median_3sd_smallval <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric()) }
  plot4_median_3sd_smallval <- ggplot(dataforAICMedians_3sd_smallval_clean, aes(x = Ideal, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data2_median_3sd_smallval, aes(y = predicted_reasonable), color = "red", size = 1) + labs(title = "Model 3 (WITH 3SD, Log Small Const): Effect of Median Ideal", subtitle = paste("AIC =", round(AIC3Medians_3sd_smallval, 2), ", R² =", round(r2_model3_median_3sd_smallval, 3)), x = "Log (Small Const) Median Ideal Rating (3SD Removed)", y = "Log (Small Const) Median Reasonable Rating (3SD Removed)") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Display plots
  print(plot1_median_3sd_smallval)
  print(plot2_median_3sd_smallval)
  print(plot3_median_3sd_smallval)
  print(plot4_median_3sd_smallval)

  # Save plots
  ggsave("model1_median_avg_reas_3sd_smallval.png", plot1_median_3sd_smallval, width = 8, height = 6, dpi = 300)
  ggsave("model2_median_ideal_reas_3sd_smallval.png", plot2_median_3sd_smallval, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_avg_eff_3sd_smallval.png", plot3_median_3sd_smallval, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_ideal_eff_3sd_smallval.png", plot4_median_3sd_smallval, width = 8, height = 6, dpi = 300)
  grid_plots_median_3sd_smallval <- grid.arrange(plot1_median_3sd_smallval, plot2_median_3sd_smallval, plot3_median_3sd_smallval, plot4_median_3sd_smallval, ncol = 2)
  ggsave("all_median_plots_3sd_smallval.png", grid_plots_median_3sd_smallval, width = 14, height = 10, dpi = 300)

  # Correlation matrix
  cor_data_3sd_smallval <- dataforAICMedians_3sd_smallval_clean[complete.cases(dataforAICMedians_3sd_smallval_clean),]
  if(nrow(cor_data_3sd_smallval) > 1) {
   cor_matrix_3sd_smallval <- cor(cor_data_3sd_smallval, use = "complete.obs")
   corrplot(cor_matrix_3sd_smallval, method = "color", type = "upper", addCoef.col = "black", tl.col = "black", tl.srt = 45, diag = FALSE, title = "Correlation Matrix (WITH 3SD, Log Small Constant)", mar = c(0,0,1,0))
  } else { cat("Not enough data for correlation matrix (WITH 3SD, Log Small Constant).\n") }

  # Coefficient tables
  model1_coefs_3sd_smallval <- get_model_coefs(Model1Medians_3sd_smallval)
  model2_coefs_3sd_smallval <- get_model_coefs(Model2Medians_3sd_smallval)
  model3_coefs_3sd_smallval <- get_model_coefs(Model3Medians_3sd_smallval)
  model1_coefs_3sd_smallval$p_value <- sapply(model1_coefs_3sd_smallval$p_value, format_pvalue)
  model2_coefs_3sd_smallval$p_value <- sapply(model2_coefs_3sd_smallval$p_value, format_pvalue)
  model3_coefs_3sd_smallval$p_value <- sapply(model3_coefs_3sd_smallval$p_value, format_pvalue)
  kable(model1_coefs_3sd_smallval, caption = "Model 1 Coeffs (WITH 3SD, Log Small Const)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model2_coefs_3sd_smallval, caption = "Model 2 Coeffs (WITH 3SD, Log Small Const)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model3_coefs_3sd_smallval, caption = "Model 3 Coeffs (WITH 3SD, Log Small Const)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

  # Print model summaries
  cat("\nModel Summaries (WITH 3SD, Log Small Constant):\n")
  print(summary(Model1Medians_3sd_smallval))
  print(summary(Model2Medians_3sd_smallval))
  print(summary(Model3Medians_3sd_smallval))

} else {
   cat("Skipping plots and summaries for (WITH 3SD, Log Small Constant) due to insufficient data.\n")
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## 
## Model Summaries (WITH 3SD, Log Small Constant):
## 
## Call:
## lm(formula = Reasonable ~ Average, data = dataforAICMedians_3sd_smallval_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1568 -0.3358 -0.1127  0.6892  1.3648 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.02213    0.31715   0.070    0.945    
## Average      0.81753    0.09896   8.261 2.49e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9284 on 31 degrees of freedom
## Multiple R-squared:  0.6876, Adjusted R-squared:  0.6776 
## F-statistic: 68.24 on 1 and 31 DF,  p-value: 2.486e-09
## 
## 
## Call:
## lm(formula = Reasonable ~ Ideal, data = dataforAICMedians_3sd_smallval_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4195 -0.7295 -0.2187  0.2721  2.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.82816    0.20911   8.743 7.16e-10 ***
## Ideal        0.41036    0.06769   6.063 1.03e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.124 on 31 degrees of freedom
## Multiple R-squared:  0.5425, Adjusted R-squared:  0.5277 
## F-statistic: 36.76 on 1 and 31 DF,  p-value: 1.027e-06
## 
## 
## Call:
## lm(formula = Reasonable ~ Average + Ideal, data = dataforAICMedians_3sd_smallval_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93341 -0.36561 -0.06373  0.34015  1.23273 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.19225    0.16724   1.150    0.259    
## Average      0.64316    0.05528  11.634 1.20e-12 ***
## Ideal        0.28444    0.03124   9.104 3.88e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4865 on 30 degrees of freedom
## Multiple R-squared:  0.917,  Adjusted R-squared:  0.9115 
## F-statistic: 165.7 on 2 and 30 DF,  p-value: < 2.2e-16
# Store results
analysis_3_results_3sd_smallval <- list(
  medians_by_question = if(exists("medians_df_3sd")) medians_df_3sd else NULL,
  log_medians_by_question = if(exists("log_medians_df_3sd_smallval")) log_medians_df_3sd_smallval else NULL,
  regression_models = if(exists("Model1Medians_3sd_smallval")) list(model1 = Model1Medians_3sd_smallval, model2 = Model2Medians_3sd_smallval, model3 = Model3Medians_3sd_smallval) else NULL,
  regression_summary = if(exists("regression_summary_medians_3sd_smallval")) regression_summary_medians_3sd_smallval else NULL,
  aic_values = if(exists("AIC1Medians_3sd_smallval")) c(AIC1 = AIC1Medians_3sd_smallval, AIC2 = AIC2Medians_3sd_smallval, AIC3 = AIC3Medians_3sd_smallval) else NULL
)

4b. (Plus one to all values): Plots and Reporting (WITH 3SD Removal)

# Check if models exist before plotting
if(exists("Model1Medians_3sd_plusone") && exists("dataforAICMedians_3sd_plusone_clean") && nrow(dataforAICMedians_3sd_plusone_clean) > 0) {

  # Average Median vs Reasonable Median
  plot1_median_3sd_plusone <- ggplot(dataforAICMedians_3sd_plusone_clean, aes(x = Average, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "blue") +
    labs(title = "Model 1 (WITH 3SD, Log+1): Median Average vs Reasonable",
         subtitle = paste("AIC =", round(AIC1Medians_3sd_plusone, 2), ", R² =", round(r2_model1_median_3sd_plusone, 3)),
         x = "Log (+1) Median Average Rating (3SD Removed)",
         y = "Log (+1) Median Reasonable Rating (3SD Removed)") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Ideal Median vs Reasonable Median
  plot2_median_3sd_plusone <- ggplot(dataforAICMedians_3sd_plusone_clean, aes(x = Ideal, y = Reasonable)) +
    geom_point(size = 3, alpha = 0.7) + geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = "Model 2 (WITH 3SD, Log+1): Median Ideal vs Reasonable",
         subtitle = paste("AIC =", round(AIC2Medians_3sd_plusone, 2), ", R² =", round(r2_model2_median_3sd_plusone, 3)),
         x = "Log (+1) Median Ideal Rating (3SD Removed)",
         y = "Log (+1) Median Reasonable Rating (3SD Removed)") +
    theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Average effect)
  new_data_median_3sd_plusone <- expand.grid( Average = seq(min(dataforAICMedians_3sd_plusone_clean$Average, na.rm=TRUE), max(dataforAICMedians_3sd_plusone_clean$Average, na.rm=TRUE), length.out = 100), Ideal = median(dataforAICMedians_3sd_plusone_clean$Ideal, na.rm=TRUE) )
  new_data_median_3sd_plusone <- new_data_median_3sd_plusone[is.finite(new_data_median_3sd_plusone$Average) & is.finite(new_data_median_3sd_plusone$Ideal),]
  if(nrow(new_data_median_3sd_plusone)>0){ new_data_median_3sd_plusone$predicted_reasonable <- predict(Model3Medians_3sd_plusone, newdata = new_data_median_3sd_plusone) } else { new_data_median_3sd_plusone <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric()) }
  plot3_median_3sd_plusone <- ggplot(dataforAICMedians_3sd_plusone_clean, aes(x = Average, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data_median_3sd_plusone, aes(y = predicted_reasonable), color = "blue", size = 1) + labs(title = "Model 3 (WITH 3SD, Log+1): Effect of Median Average", subtitle = paste("AIC =", round(AIC3Medians_3sd_plusone, 2), ", R² =", round(r2_model3_median_3sd_plusone, 3)), x = "Log (+1) Median Average Rating (3SD Removed)", y = "Log (+1) Median Reasonable Rating (3SD Removed)") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Model 3 (Ideal effect)
  new_data2_median_3sd_plusone <- expand.grid( Average = median(dataforAICMedians_3sd_plusone_clean$Average, na.rm=TRUE), Ideal = seq(min(dataforAICMedians_3sd_plusone_clean$Ideal, na.rm=TRUE), max(dataforAICMedians_3sd_plusone_clean$Ideal, na.rm=TRUE), length.out = 100) )
  new_data2_median_3sd_plusone <- new_data2_median_3sd_plusone[is.finite(new_data2_median_3sd_plusone$Average) & is.finite(new_data2_median_3sd_plusone$Ideal),]
  if(nrow(new_data2_median_3sd_plusone)>0){ new_data2_median_3sd_plusone$predicted_reasonable <- predict(Model3Medians_3sd_plusone, newdata = new_data2_median_3sd_plusone) } else { new_data2_median_3sd_plusone <- data.frame(Average=numeric(), Ideal=numeric(), predicted_reasonable=numeric()) }
  plot4_median_3sd_plusone <- ggplot(dataforAICMedians_3sd_plusone_clean, aes(x = Ideal, y = Reasonable)) + geom_point(size = 3, alpha = 0.5) + geom_line(data = new_data2_median_3sd_plusone, aes(y = predicted_reasonable), color = "red", size = 1) + labs(title = "Model 3 (WITH 3SD, Log+1): Effect of Median Ideal", subtitle = paste("AIC =", round(AIC3Medians_3sd_plusone, 2), ", R² =", round(r2_model3_median_3sd_plusone, 3)), x = "Log (+1) Median Ideal Rating (3SD Removed)", y = "Log (+1) Median Reasonable Rating (3SD Removed)") + theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9) )

  # Display plots
  print(plot1_median_3sd_plusone)
  print(plot2_median_3sd_plusone)
  print(plot3_median_3sd_plusone)
  print(plot4_median_3sd_plusone)

  # Save plots
  ggsave("model1_median_avg_reas_3sd_plusone.png", plot1_median_3sd_plusone, width = 8, height = 6, dpi = 300)
  ggsave("model2_median_ideal_reas_3sd_plusone.png", plot2_median_3sd_plusone, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_avg_eff_3sd_plusone.png", plot3_median_3sd_plusone, width = 8, height = 6, dpi = 300)
  ggsave("model3_median_ideal_eff_3sd_plusone.png", plot4_median_3sd_plusone, width = 8, height = 6, dpi = 300)
  grid_plots_median_3sd_plusone <- grid.arrange(plot1_median_3sd_plusone, plot2_median_3sd_plusone, plot3_median_3sd_plusone, plot4_median_3sd_plusone, ncol = 2)
  ggsave("all_median_plots_3sd_plusone.png", grid_plots_median_3sd_plusone, width = 14, height = 10, dpi = 300)

  # Correlation matrix
  cor_data_3sd_plusone <- dataforAICMedians_3sd_plusone_clean[complete.cases(dataforAICMedians_3sd_plusone_clean),]
  if(nrow(cor_data_3sd_plusone) > 1) {
   cor_matrix_3sd_plusone <- cor(cor_data_3sd_plusone, use = "complete.obs")
   corrplot(cor_matrix_3sd_plusone, method = "color", type = "upper", addCoef.col = "black", tl.col = "black", tl.srt = 45, diag = FALSE, title = "Correlation Matrix (WITH 3SD, Log Plus One)", mar = c(0,0,1,0))
  } else { cat("Not enough data for correlation matrix (WITH 3SD, Log Plus One).\n") }

  # Coefficient tables
  model1_coefs_3sd_plusone <- get_model_coefs(Model1Medians_3sd_plusone)
  model2_coefs_3sd_plusone <- get_model_coefs(Model2Medians_3sd_plusone)
  model3_coefs_3sd_plusone <- get_model_coefs(Model3Medians_3sd_plusone)
  model1_coefs_3sd_plusone$p_value <- sapply(model1_coefs_3sd_plusone$p_value, format_pvalue)
  model2_coefs_3sd_plusone$p_value <- sapply(model2_coefs_3sd_plusone$p_value, format_pvalue)
  model3_coefs_3sd_plusone$p_value <- sapply(model3_coefs_3sd_plusone$p_value, format_pvalue)
  kable(model1_coefs_3sd_plusone, caption = "Model 1 Coeffs (WITH 3SD, Log Plus One)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model2_coefs_3sd_plusone, caption = "Model 2 Coeffs (WITH 3SD, Log Plus One)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
  kable(model3_coefs_3sd_plusone, caption = "Model 3 Coeffs (WITH 3SD, Log Plus One)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)

  # Print model summaries
  cat("\nModel Summaries (WITH 3SD, Log Plus One):\n")
  print(summary(Model1Medians_3sd_plusone))
  print(summary(Model2Medians_3sd_plusone))
  print(summary(Model3Medians_3sd_plusone))

} else {
  cat("Skipping plots and summaries for (WITH 3SD, Log Plus One) due to insufficient data.\n")
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## 
## Model Summaries (WITH 3SD, Log Plus One):
## 
## Call:
## lm(formula = Reasonable ~ Average, data = dataforAICMedians_3sd_plusone_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0611 -0.3553 -0.1168  0.6285  1.3474 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.14186    0.30917   0.459     0.65    
## Average      0.80408    0.09458   8.501 1.33e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8484 on 31 degrees of freedom
## Multiple R-squared:  0.6998, Adjusted R-squared:  0.6901 
## F-statistic: 72.27 on 1 and 31 DF,  p-value: 1.331e-09
## 
## 
## Call:
## lm(formula = Reasonable ~ Ideal, data = dataforAICMedians_3sd_plusone_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6903 -0.3466 -0.1764  0.2491  1.6321 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.83447    0.16400   5.088 1.67e-05 ***
## Ideal        0.81779    0.06312  12.956 4.74e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6114 on 31 degrees of freedom
## Multiple R-squared:  0.8441, Adjusted R-squared:  0.8391 
## F-statistic: 167.9 on 1 and 31 DF,  p-value: 4.742e-14
## 
## 
## Call:
## lm(formula = Reasonable ~ Average + Ideal, data = dataforAICMedians_3sd_plusone_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60695 -0.21446 -0.02932  0.09051  1.15093 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.12470    0.12108   1.030    0.311    
## Average      0.41207    0.04759   8.659 1.17e-09 ***
## Ideal        0.57820    0.04407  13.120 5.81e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3322 on 30 degrees of freedom
## Multiple R-squared:  0.9555, Adjusted R-squared:  0.9525 
## F-statistic: 321.7 on 2 and 30 DF,  p-value: < 2.2e-16
# Store results
analysis_3_results_3sd_plusone <- list(
  medians_by_question = if(exists("medians_df_3sd")) medians_df_3sd else NULL,
  log_medians_by_question = if(exists("log_medians_df_3sd_plusone")) log_medians_df_3sd_plusone else NULL,
  regression_models = if(exists("Model1Medians_3sd_plusone")) list(model1 = Model1Medians_3sd_plusone, model2 = Model2Medians_3sd_plusone, model3 = Model3Medians_3sd_plusone) else NULL,
  regression_summary = if(exists("regression_summary_medians_3sd_plusone")) regression_summary_medians_3sd_plusone else NULL,
  aic_values = if(exists("AIC1Medians_3sd_plusone")) c(AIC1 = AIC1Medians_3sd_plusone, AIC2 = AIC2Medians_3sd_plusone, AIC3 = AIC3Medians_3sd_plusone) else NULL
)

Analysis 4 [NO 3 SD Outlier Removal]: Overall results, intermediacy with median ratings

Step 1 and Step 2 have been completed in the earlier analysis.

Functions for Analysis 4

# Function to check if reasonable is on the "ideal side" of average
# Equal median ratings are treated as being on the predicted side
is_ideal_side_median <- function(average, ideal, reasonable) {
  if (any(is.na(c(average, ideal, reasonable)))) {
    return(NA)
  }
  # Check if distance to ideal is less than or equal to distance to average
  ifelse(abs(reasonable - ideal) <= abs(reasonable - average), 1, 0)
}

# Function to check if reasonable is on the "average side" of ideal
# Equal median ratings are treated as being on the predicted side
is_average_side_median <- function(average, ideal, reasonable) {
  if (any(is.na(c(average, ideal, reasonable)))) {
      return(NA)
  }
  # Check if distance to average is less than or equal to distance to ideal
  ifelse(abs(reasonable - average) <= abs(reasonable - ideal), 1, 0)
}

# Function to check if reasonable falls between average and ideal (inclusive)
# Handles cases where average == ideal
is_both_sides_median <- function(average, ideal, reasonable) {
  if (any(is.na(c(average, ideal, reasonable)))) {
      return(NA)
  }
  # Check if reasonable is numerically between average and ideal, inclusive
  # This works regardless of whether average > ideal or ideal > average
  ifelse((reasonable >= min(average, ideal)) & (reasonable <= max(average, ideal)), 1, 0)
}

Steps 3, 4, 5: Intermediacy Calculations (NO 3SD Removal)

# --- Calculate all vectors ---
ideal_side_vector_median_no3sd <- mapply(is_ideal_side_median, column_medians_average_no3sd, column_medians_ideal_no3sd, column_medians_reasonable_no3sd)
average_side_vector_median_no3sd <- mapply(is_average_side_median, column_medians_average_no3sd, column_medians_ideal_no3sd, column_medians_reasonable_no3sd)
both_sides_vector_median_no3sd <- mapply(is_both_sides_median, column_medians_average_no3sd, column_medians_ideal_no3sd, column_medians_reasonable_no3sd)

# --- Calculate counts and valid Ns ---
count_ideal_side_median_no3sd <- sum(ideal_side_vector_median_no3sd, na.rm = TRUE)
valid_questions_ideal_no3sd <- sum(!is.na(ideal_side_vector_median_no3sd))
count_average_side_median_no3sd <- sum(average_side_vector_median_no3sd, na.rm = TRUE)
valid_questions_avg_no3sd <- sum(!is.na(average_side_vector_median_no3sd))
count_both_sides_median_no3sd <- sum(both_sides_vector_median_no3sd, na.rm = TRUE)
valid_questions_both_no3sd <- sum(!is.na(both_sides_vector_median_no3sd))

# --- Perform binomial tests ---
p_val_ideal_no3sd <- NA; binomial_result_ideal_median_no3sd <- NULL
if (valid_questions_ideal_no3sd > 0) {
  binomial_result_ideal_median_no3sd <- binom.test(count_ideal_side_median_no3sd, valid_questions_ideal_no3sd, p = 0.5, alternative = "greater")
  p_val_ideal_no3sd <- binomial_result_ideal_median_no3sd$p.value
}

p_val_avg_no3sd <- NA; binomial_result_avg_side_median_no3sd <- NULL
if (valid_questions_avg_no3sd > 0) {
  binomial_result_avg_side_median_no3sd <- binom.test(count_average_side_median_no3sd, valid_questions_avg_no3sd, p = 0.5, alternative = "greater")
   p_val_avg_no3sd <- binomial_result_avg_side_median_no3sd$p.value
}

p_val_both_no3sd <- NA; binomial_result_both_sides_median_no3sd <- NULL
if (valid_questions_both_no3sd > 0) {
  binomial_result_both_sides_median_no3sd <- binom.test(count_both_sides_median_no3sd, valid_questions_both_no3sd, p = 1/3, alternative = "two.sided")
  p_val_both_no3sd <- binomial_result_both_sides_median_no3sd$p.value
}

# --- Create ONE combined summary table ---
intermediacy_summary_no3sd <- data.frame(
  Test = c("On Ideal Side of Average", "On Average Side of Ideal", "Between Average and Ideal"),
  Count = c(count_ideal_side_median_no3sd, count_average_side_median_no3sd, count_both_sides_median_no3sd),
  Total_Valid = c(valid_questions_ideal_no3sd, valid_questions_avg_no3sd, valid_questions_both_no3sd),
  Proportion = c(ifelse(valid_questions_ideal_no3sd > 0, count_ideal_side_median_no3sd / valid_questions_ideal_no3sd, NA),
                 ifelse(valid_questions_avg_no3sd > 0, count_average_side_median_no3sd / valid_questions_avg_no3sd, NA),
                 ifelse(valid_questions_both_no3sd > 0, count_both_sides_median_no3sd / valid_questions_both_no3sd, NA)),
  Expected_Prop = c(0.5, 0.5, 1/3),
  p_value = c(p_val_ideal_no3sd, p_val_avg_no3sd, p_val_both_no3sd)
)

# Format proportions and p-values
intermediacy_summary_no3sd$Proportion <- scales::percent(intermediacy_summary_no3sd$Proportion, accuracy = 0.1)
intermediacy_summary_no3sd$Expected_Prop <- scales::percent(intermediacy_summary_no3sd$Expected_Prop, accuracy = 0.1)
intermediacy_summary_no3sd$p_value <- sapply(intermediacy_summary_no3sd$p_value, format_pvalue)

# Display the summary table
kable(intermediacy_summary_no3sd[,c("Test", "Count", "Total_Valid", "Proportion", "Expected_Prop", "p_value")],
      caption = "Intermediacy Binomial Tests (NO 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Intermediacy Binomial Tests (NO 3SD Removal)
Test Count Total_Valid Proportion Expected_Prop p_value
On Ideal Side of Average 29 33 87.9% 50.0% < .001
On Average Side of Ideal 8 33 24.2% 50.0% .999
Between Average and Ideal 31 33 93.9% 33.3% < .001

6. Plots and Reporting (Intermediacy - Median Analysis, NO 3SD Removed)

# Create a table showing the results for each question
question_position_summary_median_no3sd <- data.frame(
  Question = names(column_medians_average_no3sd),
  Average_Value = column_medians_average_no3sd,
  Ideal_Value = column_medians_ideal_no3sd,
  Reasonable_Value = column_medians_reasonable_no3sd,
  On_Ideal_Side = ideal_side_vector_median_no3sd,
  On_Average_Side = average_side_vector_median_no3sd,
  Between_Average_And_Ideal = both_sides_vector_median_no3sd
)
# Handle NAs and Categorize
question_position_summary_median_no3sd <- question_position_summary_median_no3sd %>%
  mutate(
    On_Ideal_Side = ifelse(is.na(On_Ideal_Side), 0, On_Ideal_Side),
    On_Average_Side = ifelse(is.na(On_Average_Side), 0, On_Average_Side),
    Between_Average_And_Ideal = ifelse(is.na(Between_Average_And_Ideal), 0, Between_Average_And_Ideal)
  )
question_position_summary_median_no3sd$Position <- "Other/NA"
question_position_summary_median_no3sd$Position[question_position_summary_median_no3sd$Between_Average_And_Ideal == 1] <- "Between Average and Ideal"
question_position_summary_median_no3sd$Position[question_position_summary_median_no3sd$Between_Average_And_Ideal == 0 & question_position_summary_median_no3sd$On_Ideal_Side == 1 & question_position_summary_median_no3sd$On_Average_Side == 0] <- "Beyond Ideal"
question_position_summary_median_no3sd$Position[question_position_summary_median_no3sd$Between_Average_And_Ideal == 0 & question_position_summary_median_no3sd$On_Average_Side == 1 & question_position_summary_median_no3sd$On_Ideal_Side == 0] <- "Beyond Average"
question_position_summary_median_no3sd$Position[
    !is.na(question_position_summary_median_no3sd$Average_Value) &
    !is.na(question_position_summary_median_no3sd$Ideal_Value) &
    question_position_summary_median_no3sd$Average_Value == question_position_summary_median_no3sd$Ideal_Value] <- "Average = Ideal"

# Display the question position summary
kable(question_position_summary_median_no3sd[, c("Question", "Average_Value", "Ideal_Value", "Reasonable_Value", "Position")],
      caption = "Position of Median Reasonable Rating (NO 3SD Removal) Relative to Average and Ideal") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Position of Median Reasonable Rating (NO 3SD Removal) Relative to Average and Ideal
Question Average_Value Ideal_Value Reasonable_Value Position
q1 q1 3.0 2.0 2.0 Between Average and Ideal
q2 q2 7.0 2.0 4.0 Between Average and Ideal
q3 q3 3.0 6.0 5.0 Between Average and Ideal
q4 q4 2000.0 2000.0 2000.0 Average = Ideal
q5 q5 20.0 30.0 30.0 Between Average and Ideal
q6 q6 10.0 1.0 3.0 Between Average and Ideal
q7 q7 15.0 5.0 10.0 Between Average and Ideal
q8 q8 5.0 12.0 10.0 Between Average and Ideal
q9 q9 5.0 3.0 5.0 Between Average and Ideal
q10 q10 5.0 0.0 2.0 Between Average and Ideal
q11 q11 500.0 0.0 10.0 Between Average and Ideal
q12 q12 30.0 1.0 10.0 Between Average and Ideal
q13 q13 40.0 10.0 15.0 Between Average and Ideal
q14 q14 10.0 2.0 5.0 Between Average and Ideal
q15 q15 9.0 10.0 8.0 Beyond Average
q16 q16 5.0 5.0 4.0 Average = Ideal
q17 q17 3.0 0.0 1.0 Between Average and Ideal
q18 q18 15.0 2.0 9.5 Between Average and Ideal
q20 q20 20.0 0.0 4.0 Between Average and Ideal
q21 q21 8.0 5.0 5.0 Between Average and Ideal
q22 q22 10.0 7.0 7.0 Between Average and Ideal
q23 q23 2.0 3.0 3.0 Between Average and Ideal
q24 q24 16.0 24.0 24.0 Between Average and Ideal
q25 q25 2000.0 500.0 1000.0 Between Average and Ideal
q26 q26 8.0 2.0 4.0 Between Average and Ideal
q27 q27 12.0 5.0 6.0 Between Average and Ideal
q28 q28 10.0 20.0 20.0 Between Average and Ideal
q29 q29 55.0 90.0 80.0 Between Average and Ideal
q30 q30 12.0 4.0 4.0 Between Average and Ideal
q31 q31 50.0 13.0 25.0 Between Average and Ideal
q32 q32 24.0 48.0 40.0 Between Average and Ideal
q33 q33 5.5 2.5 4.0 Between Average and Ideal
q34 q34 50.0 10.0 50.0 Between Average and Ideal
# Count the number of questions in each position category
position_counts_median_no3sd <- table(question_position_summary_median_no3sd$Position)
position_summary_median_no3sd <- data.frame(Position = names(position_counts_median_no3sd), Count = as.numeric(position_counts_median_no3sd))
total_categorized_no3sd <- sum(position_summary_median_no3sd$Count)
position_summary_median_no3sd$Percentage <- ifelse(total_categorized_no3sd > 0, (position_summary_median_no3sd$Count / total_categorized_no3sd) * 100, 0)
position_summary_median_no3sd$Percentage <- round(position_summary_median_no3sd$Percentage, 1)

# Display the position count summary
kable(position_summary_median_no3sd, caption = "Distribution of Position Categories (Median Values, NO 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Distribution of Position Categories (Median Values, NO 3SD Removal)
Position Count Percentage
Average = Ideal 2 6.1
Between Average and Ideal 30 6.1
Beyond Average 1 6.1
# Create distribution plot
position_distribution_plot_median_no3sd <- ggplot(position_summary_median_no3sd, aes(x = Position, y = Count, fill = Position)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(Count)), vjust = -0.5, size = 4) +
  labs(title = "Distribution of Position Categories (Median Values, NO 3SD Removal)",
       subtitle = "Where does 'Reasonable' fall relative to 'Average' and 'Ideal'?",
       x = "Position Category", y = "Number of Questions") +
  theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9, angle = 15, hjust = 1), legend.position = "none")
print(position_distribution_plot_median_no3sd)

ggsave("position_distribution_median_no3sd_removed.png", position_distribution_plot_median_no3sd, width = 10, height = 7, dpi = 300)

# Create scatterplot
plot_data_intermediacy_no3sd <- question_position_summary_median_no3sd %>% filter(!is.na(Average_Value) & !is.na(Ideal_Value) & !is.na(Reasonable_Value))
prop_both_no3sd_fmt <- scales::percent(ifelse(valid_questions_both_no3sd > 0, count_both_sides_median_no3sd/valid_questions_both_no3sd, 0), accuracy=0.1)
intermediacy_plot_median_no3sd <- ggplot(plot_data_intermediacy_no3sd, aes(x = Average_Value, y = Ideal_Value)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", alpha = 0.5) +
  geom_point(color = "gray60", size = 3, alpha = 0.6) +
  geom_point(aes(x = Average_Value, y = Reasonable_Value, color = Position), size = 4) +
  geom_segment(aes(x = Average_Value, xend = Average_Value, y = Ideal_Value, yend = Reasonable_Value, color = Position), linetype = "dotted", size = 0.5, alpha = 0.7) +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Intermediacy Analysis (Median, NO 3SD Removal): Position of Reasonable",
       subtitle = paste0("Between Avg and Ideal: ", count_both_sides_median_no3sd, " of ", valid_questions_both_no3sd, " (", prop_both_no3sd_fmt, ")"),
       x = "Median Average Rating (NO 3SD Removal)", y = "Median Rating Value (NO 3SD Removal)", color = "Position Category") +
  theme_minimal() + theme(plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9), legend.position = "bottom")
print(intermediacy_plot_median_no3sd)

ggsave("intermediacy_analysis_median_no3sd_removed.png", intermediacy_plot_median_no3sd, width = 10, height = 8, dpi = 300)

# Create rating patterns plot
ratings_long_median_no3sd <- question_position_summary_median_no3sd %>%
  select(Question, Average_Value, Ideal_Value, Reasonable_Value, Position) %>%
  pivot_longer(cols = c(Average_Value, Ideal_Value, Reasonable_Value), names_to = "Rating_Type", values_to = "Value") %>%
  mutate(Rating_Type = factor(gsub("_Value", "", Rating_Type), levels = c("Average", "Reasonable", "Ideal"))) %>%
  filter(!is.na(Value))
rating_patterns_plot_median_no3sd <- ggplot(ratings_long_median_no3sd, aes(x = Rating_Type, y = Value, group = Question, color = Position)) +
  geom_line(alpha = 0.5) + geom_point(size = 2.5, alpha=0.7) + facet_wrap(~ Position, scales = "free_y") +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Rating Patterns by Position Category (Median Values, NO 3SD Removal)",
       subtitle = "How Average, Ideal, and Reasonable median ratings relate within each category",
       x = "Rating Type", y = "Median Value (NO 3SD Removal)") +
  theme_minimal() + theme(plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text.x = element_text(size = 9, angle = 45, hjust = 1), axis.text.y = element_text(size = 9), strip.text = element_text(size=10, face="bold"), legend.position = "none")
print(rating_patterns_plot_median_no3sd)

ggsave("rating_patterns_by_position_median_no3sd_removed.png", rating_patterns_plot_median_no3sd, width = 12, height = 8, dpi = 300)

# Store results
analysis_4_results_no3sd <- list(
  ideal_side = list(vector=ideal_side_vector_median_no3sd, count=count_ideal_side_median_no3sd, valid_n=valid_questions_ideal_no3sd, binomial_test=binomial_result_ideal_median_no3sd, summary_row=intermediacy_summary_no3sd[1,]),
  average_side = list(vector=average_side_vector_median_no3sd, count=count_average_side_median_no3sd, valid_n=valid_questions_avg_no3sd, binomial_test=binomial_result_avg_side_median_no3sd, summary_row=intermediacy_summary_no3sd[2,]),
  both_sides = list(vector=both_sides_vector_median_no3sd, count=count_both_sides_median_no3sd, valid_n=valid_questions_both_no3sd, binomial_test=binomial_result_both_sides_median_no3sd, summary_row=intermediacy_summary_no3sd[3,]),
  combined_summary = intermediacy_summary_no3sd,
  question_position = question_position_summary_median_no3sd,
  position_summary = position_summary_median_no3sd
)

Analysis 4 [WITH 3 SD Outlier Removal]: Overall results, intermediacy with median ratings

Step 1 and Step 2 have been completed in the earlier analysis.

Functions for Analysis 4

# Function to check if reasonable is on the "ideal side" of average
# Equal median ratings are treated as being on the predicted side
is_ideal_side_median <- function(average, ideal, reasonable) {
  if (any(is.na(c(average, ideal, reasonable)))) {
    return(NA)
  }
  # Check if distance to ideal is less than or equal to distance to average
  ifelse(abs(reasonable - ideal) <= abs(reasonable - average), 1, 0)
}

# Function to check if reasonable is on the "average side" of ideal
# Equal median ratings are treated as being on the predicted side
is_average_side_median <- function(average, ideal, reasonable) {
  if (any(is.na(c(average, ideal, reasonable)))) {
      return(NA)
  }
  # Check if distance to average is less than or equal to distance to ideal
  ifelse(abs(reasonable - average) <= abs(reasonable - ideal), 1, 0)
}

# Function to check if reasonable falls between average and ideal (inclusive)
# Handles cases where average == ideal
is_both_sides_median <- function(average, ideal, reasonable) {
  if (any(is.na(c(average, ideal, reasonable)))) {
      return(NA)
  }
  # Check if reasonable is numerically between average and ideal, inclusive
  # This works regardless of whether average > ideal or ideal > average
  ifelse((reasonable >= min(average, ideal)) & (reasonable <= max(average, ideal)), 1, 0)
}

Steps 3, 4, 5: Intermediacy Calculations (WITH 3SD Removal)

# --- Calculate all vectors ---
ideal_side_vector_median_3sd <- mapply(is_ideal_side_median, column_medians_average_3sd, column_medians_ideal_3sd, column_medians_reasonable_3sd)
average_side_vector_median_3sd <- mapply(is_average_side_median, column_medians_average_3sd, column_medians_ideal_3sd, column_medians_reasonable_3sd)
both_sides_vector_median_3sd <- mapply(is_both_sides_median, column_medians_average_3sd, column_medians_ideal_3sd, column_medians_reasonable_3sd)

# --- Calculate counts and valid Ns ---
count_ideal_side_median_3sd <- sum(ideal_side_vector_median_3sd, na.rm = TRUE)
valid_questions_ideal_3sd <- sum(!is.na(ideal_side_vector_median_3sd))
count_average_side_median_3sd <- sum(average_side_vector_median_3sd, na.rm = TRUE)
valid_questions_avg_3sd <- sum(!is.na(average_side_vector_median_3sd))
count_both_sides_median_3sd <- sum(both_sides_vector_median_3sd, na.rm = TRUE)
valid_questions_both_3sd <- sum(!is.na(both_sides_vector_median_3sd))

# --- Perform binomial tests ---
p_val_ideal_3sd <- NA; binomial_result_ideal_median_3sd <- NULL
if (valid_questions_ideal_3sd > 0) {
  binomial_result_ideal_median_3sd <- binom.test(count_ideal_side_median_3sd, valid_questions_ideal_3sd, p = 0.5, alternative = "greater")
  p_val_ideal_3sd <- binomial_result_ideal_median_3sd$p.value
}

p_val_avg_3sd <- NA; binomial_result_avg_side_median_3sd <- NULL
if (valid_questions_avg_3sd > 0) {
  binomial_result_avg_side_median_3sd <- binom.test(count_average_side_median_3sd, valid_questions_avg_3sd, p = 0.5, alternative = "greater")
   p_val_avg_3sd <- binomial_result_avg_side_median_3sd$p.value
}

p_val_both_3sd <- NA; binomial_result_both_sides_median_3sd <- NULL
if (valid_questions_both_3sd > 0) {
  binomial_result_both_sides_median_3sd <- binom.test(count_both_sides_median_3sd, valid_questions_both_3sd, p = 1/3, alternative = "two.sided")
  p_val_both_3sd <- binomial_result_both_sides_median_3sd$p.value
}

# --- Create ONE combined summary table ---
intermediacy_summary_3sd <- data.frame(
  Test = c("On Ideal Side of Average", "On Average Side of Ideal", "Between Average and Ideal"),
  Count = c(count_ideal_side_median_3sd, count_average_side_median_3sd, count_both_sides_median_3sd),
  Total_Valid = c(valid_questions_ideal_3sd, valid_questions_avg_3sd, valid_questions_both_3sd),
  Proportion = c(ifelse(valid_questions_ideal_3sd > 0, count_ideal_side_median_3sd / valid_questions_ideal_3sd, NA),
                 ifelse(valid_questions_avg_3sd > 0, count_average_side_median_3sd / valid_questions_avg_3sd, NA),
                 ifelse(valid_questions_both_3sd > 0, count_both_sides_median_3sd / valid_questions_both_3sd, NA)),
  Expected_Prop = c(0.5, 0.5, 1/3),
  p_value = c(p_val_ideal_3sd, p_val_avg_3sd, p_val_both_3sd)
)

# Format proportions and p-values
intermediacy_summary_3sd$Proportion <- scales::percent(intermediacy_summary_3sd$Proportion, accuracy = 0.1)
intermediacy_summary_3sd$Expected_Prop <- scales::percent(intermediacy_summary_3sd$Expected_Prop, accuracy = 0.1)
intermediacy_summary_3sd$p_value <- sapply(intermediacy_summary_3sd$p_value, format_pvalue)

# Display the summary table
kable(intermediacy_summary_3sd[,c("Test", "Count", "Total_Valid", "Proportion", "Expected_Prop", "p_value")],
      caption = "Intermediacy Binomial Tests (WITH 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Intermediacy Binomial Tests (WITH 3SD Removal)
Test Count Total_Valid Proportion Expected_Prop p_value
On Ideal Side of Average 29 33 87.9% 50.0% < .001
On Average Side of Ideal 8 33 24.2% 50.0% .999
Between Average and Ideal 32 33 97.0% 33.3% < .001

6. Plots and Reporting (Intermediacy - Median Analysis, WITH 3SD Removed)

# Create a table showing the results for each question
question_position_summary_median_3sd <- data.frame(
  Question = names(column_medians_average_3sd),
  Average_Value = column_medians_average_3sd,
  Ideal_Value = column_medians_ideal_3sd,
  Reasonable_Value = column_medians_reasonable_3sd,
  On_Ideal_Side = ideal_side_vector_median_3sd,
  On_Average_Side = average_side_vector_median_3sd,
  Between_Average_And_Ideal = both_sides_vector_median_3sd
)
# Handle NAs and Categorize
question_position_summary_median_3sd <- question_position_summary_median_3sd %>%
  mutate(
    On_Ideal_Side = ifelse(is.na(On_Ideal_Side), 0, On_Ideal_Side),
    On_Average_Side = ifelse(is.na(On_Average_Side), 0, On_Average_Side),
    Between_Average_And_Ideal = ifelse(is.na(Between_Average_And_Ideal), 0, Between_Average_And_Ideal)
  )
question_position_summary_median_3sd$Position <- "Other/NA"
question_position_summary_median_3sd$Position[question_position_summary_median_3sd$Between_Average_And_Ideal == 1] <- "Between Average and Ideal"
question_position_summary_median_3sd$Position[question_position_summary_median_3sd$Between_Average_And_Ideal == 0 & question_position_summary_median_3sd$On_Ideal_Side == 1 & question_position_summary_median_3sd$On_Average_Side == 0] <- "Beyond Ideal"
question_position_summary_median_3sd$Position[question_position_summary_median_3sd$Between_Average_And_Ideal == 0 & question_position_summary_median_3sd$On_Average_Side == 1 & question_position_summary_median_3sd$On_Ideal_Side == 0] <- "Beyond Average"
question_position_summary_median_3sd$Position[
    !is.na(question_position_summary_median_3sd$Average_Value) &
    !is.na(question_position_summary_median_3sd$Ideal_Value) &
    question_position_summary_median_3sd$Average_Value == question_position_summary_median_3sd$Ideal_Value] <- "Average = Ideal"

# Display the question position summary
kable(question_position_summary_median_3sd[, c("Question", "Average_Value", "Ideal_Value", "Reasonable_Value", "Position")],
      caption = "Position of Median Reasonable Rating (WITH 3SD Removal) Relative to Average and Ideal") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Position of Median Reasonable Rating (WITH 3SD Removal) Relative to Average and Ideal
Question Average_Value Ideal_Value Reasonable_Value Position
q1 q1 3.0 2 2 Between Average and Ideal
q2 q2 7.0 2 3 Between Average and Ideal
q3 q3 3.0 6 5 Between Average and Ideal
q4 q4 2000.0 2000 2000 Average = Ideal
q5 q5 20.0 30 30 Between Average and Ideal
q6 q6 10.0 1 3 Between Average and Ideal
q7 q7 15.0 5 10 Between Average and Ideal
q8 q8 5.0 12 10 Between Average and Ideal
q9 q9 5.0 3 5 Between Average and Ideal
q10 q10 5.0 0 2 Between Average and Ideal
q11 q11 500.0 0 7 Between Average and Ideal
q12 q12 30.0 0 10 Between Average and Ideal
q13 q13 40.0 10 15 Between Average and Ideal
q14 q14 10.0 2 5 Between Average and Ideal
q15 q15 8.0 10 8 Between Average and Ideal
q16 q16 4.0 5 4 Between Average and Ideal
q17 q17 3.0 0 1 Between Average and Ideal
q18 q18 10.0 1 5 Between Average and Ideal
q20 q20 20.0 0 2 Between Average and Ideal
q21 q21 8.0 5 5 Between Average and Ideal
q22 q22 7.0 7 7 Average = Ideal
q23 q23 2.0 2 3 Average = Ideal
q24 q24 15.0 24 24 Between Average and Ideal
q25 q25 2000.0 500 1000 Between Average and Ideal
q26 q26 8.0 2 4 Between Average and Ideal
q27 q27 11.5 4 6 Between Average and Ideal
q28 q28 10.0 20 20 Between Average and Ideal
q29 q29 54.5 90 80 Between Average and Ideal
q30 q30 12.0 3 4 Between Average and Ideal
q31 q31 50.0 10 20 Between Average and Ideal
q32 q32 24.0 48 40 Between Average and Ideal
q33 q33 5.0 2 3 Between Average and Ideal
q34 q34 50.0 5 50 Between Average and Ideal
# Count the number of questions in each position category
position_counts_median_3sd <- table(question_position_summary_median_3sd$Position)
position_summary_median_3sd <- data.frame(Position = names(position_counts_median_3sd), Count = as.numeric(position_counts_median_3sd))
total_categorized_3sd <- sum(position_summary_median_3sd$Count)
position_summary_median_3sd$Percentage <- ifelse(total_categorized_3sd > 0, (position_summary_median_3sd$Count / total_categorized_3sd) * 100, 0)
position_summary_median_3sd$Percentage <- round(position_summary_median_3sd$Percentage, 1)

# Display the position count summary
kable(position_summary_median_3sd, caption = "Distribution of Position Categories (Median Values, WITH 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
Distribution of Position Categories (Median Values, WITH 3SD Removal)
Position Count Percentage
Average = Ideal 3 9.1
Between Average and Ideal 30 9.1
# Create distribution plot
position_distribution_plot_median_3sd <- ggplot(position_summary_median_3sd, aes(x = Position, y = Count, fill = Position)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(Count)), vjust = -0.5, size = 4) +
  labs(title = "Distribution of Position Categories (Median Values, WITH 3SD Removal)",
       subtitle = "Where does 'Reasonable' fall relative to 'Average' and 'Ideal'?",
       x = "Position Category", y = "Number of Questions") +
  theme_minimal() + theme( plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9, angle = 15, hjust = 1), legend.position = "none")
print(position_distribution_plot_median_3sd)

ggsave("position_distribution_median_with_3sd_removed.png", position_distribution_plot_median_3sd, width = 10, height = 7, dpi = 300)

# Create scatterplot
plot_data_intermediacy_3sd <- question_position_summary_median_3sd %>% filter(!is.na(Average_Value) & !is.na(Ideal_Value) & !is.na(Reasonable_Value))
prop_both_3sd_fmt <- scales::percent(ifelse(valid_questions_both_3sd > 0, count_both_sides_median_3sd/valid_questions_both_3sd, 0), accuracy=0.1)
intermediacy_plot_median_3sd <- ggplot(plot_data_intermediacy_3sd, aes(x = Average_Value, y = Ideal_Value)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", alpha = 0.5) +
  geom_point(color = "gray60", size = 3, alpha = 0.6) +
  geom_point(aes(x = Average_Value, y = Reasonable_Value, color = Position), size = 4) +
  geom_segment(aes(x = Average_Value, xend = Average_Value, y = Ideal_Value, yend = Reasonable_Value, color = Position), linetype = "dotted", size = 0.5, alpha = 0.7) +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Intermediacy Analysis (Median, WITH 3SD Removal): Position of Reasonable",
       subtitle = paste0("Between Avg and Ideal: ", count_both_sides_median_3sd, " of ", valid_questions_both_3sd, " (", prop_both_3sd_fmt, ")"),
       x = "Median Average Rating (WITH 3SD Removal)", y = "Median Rating Value (WITH 3SD Removal)", color = "Position Category") +
  theme_minimal() + theme(plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text = element_text(size = 9), legend.position = "bottom")
print(intermediacy_plot_median_3sd)

ggsave("intermediacy_analysis_median_with_3sd_removed.png", intermediacy_plot_median_3sd, width = 10, height = 8, dpi = 300)

# Create rating patterns plot
ratings_long_median_3sd <- question_position_summary_median_3sd %>%
  select(Question, Average_Value, Ideal_Value, Reasonable_Value, Position) %>%
  pivot_longer(cols = c(Average_Value, Ideal_Value, Reasonable_Value), names_to = "Rating_Type", values_to = "Value") %>%
  mutate(Rating_Type = factor(gsub("_Value", "", Rating_Type), levels = c("Average", "Reasonable", "Ideal"))) %>%
  filter(!is.na(Value))
rating_patterns_plot_median_3sd <- ggplot(ratings_long_median_3sd, aes(x = Rating_Type, y = Value, group = Question, color = Position)) +
  geom_line(alpha = 0.5) + geom_point(size = 2.5, alpha=0.7) + facet_wrap(~ Position, scales = "free_y") +
  scale_color_brewer(palette = "Set1") +
  labs(title = "Rating Patterns by Position Category (Median Values, WITH 3SD Removal)",
       subtitle = "How Average, Ideal, and Reasonable median ratings relate within each category",
       x = "Rating Type", y = "Median Value (WITH 3SD Removal)") +
  theme_minimal() + theme(plot.title = element_text(size = 12, face = "bold"), plot.subtitle = element_text(size = 11), axis.title = element_text(size = 10), axis.text.x = element_text(size = 9, angle = 45, hjust = 1), axis.text.y = element_text(size = 9), strip.text = element_text(size=10, face="bold"), legend.position = "none")
print(rating_patterns_plot_median_3sd)

ggsave("rating_patterns_by_position_median_with_3sd_removed.png", rating_patterns_plot_median_3sd, width = 12, height = 8, dpi = 300)

# Store results
analysis_4_results_3sd <- list(
  ideal_side = list(vector=ideal_side_vector_median_3sd, count=count_ideal_side_median_3sd, valid_n=valid_questions_ideal_3sd, binomial_test=binomial_result_ideal_median_3sd, summary_row=intermediacy_summary_3sd[1,]),
  average_side = list(vector=average_side_vector_median_3sd, count=count_average_side_median_3sd, valid_n=valid_questions_avg_3sd, binomial_test=binomial_result_avg_side_median_3sd, summary_row=intermediacy_summary_3sd[2,]),
  both_sides = list(vector=both_sides_vector_median_3sd, count=count_both_sides_median_3sd, valid_n=valid_questions_both_3sd, binomial_test=binomial_result_both_sides_median_3sd, summary_row=intermediacy_summary_3sd[3,]),
  combined_summary = intermediacy_summary_3sd,
  question_position = question_position_summary_median_3sd,
  position_summary = position_summary_median_3sd
)

Analysis 7 [NO 3 SD Removed]

# Get unique country names
countries <- unique(Data_All$Country_name)
print(countries)
##  [1] "USA"         "Colombia"    "Brazil"      "Germany"     "India"      
##  [6] "Lithuania"   "Italy"       "Poland"      "Spain"       "Netherlands"

7a (Small constant inf values): Country Regressions (NO 3SD Removal)

# Create storage
country_results_median_no3sd_smallval <- list()

# Function to analyze median data for a single country (NO 3SD, Small Constant Log)
analyze_country_median_no3sd_smallval <- function(country, data_avg, data_ideal, data_reas, q_cols) {
  cat("\n========== Analyzing Median Data (NO 3SD, Log Small Const) for:", country, "==========\n")

  # Filter data by country AND apply attention check (using _Pass1 data)
  country_average_pass <- data_avg %>% filter(Country_name == country, q19 == 15)
  country_ideal_pass <- data_ideal %>% filter(Country_name == country, q19 == 15)
  country_reasonable_pass <- data_reas %>% filter(Country_name == country, q19 == 15)

  n_avg <- nrow(country_average_pass)
  n_ideal <- nrow(country_ideal_pass)
  n_reas <- nrow(country_reasonable_pass)

  cat("Participants after attention check:", n_avg, n_ideal, n_reas, "\n")

  if(n_avg < 1 || n_ideal < 1 || n_reas < 1) {
      cat("Insufficient participants for", country, "\n"); return(NULL)
  }

   convert_to_numeric <- function(df, cols) { df %>% mutate(across(all_of(cols), ~ suppressWarnings(as.numeric(as.character(.))))) }
   country_average_pass <- convert_to_numeric(country_average_pass, q_cols)
   country_ideal_pass <- convert_to_numeric(country_ideal_pass, q_cols)
   country_reasonable_pass <- convert_to_numeric(country_reasonable_pass, q_cols)

  # Calculate medians
  medians_avg <- if(n_avg > 0) sapply(country_average_pass[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_ideal <- if(n_ideal > 0) sapply(country_ideal_pass[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_reas <- if(n_reas > 0) sapply(country_reasonable_pass[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  names(medians_avg) <- q_cols; names(medians_ideal) <- q_cols; names(medians_reas) <- q_cols

  # Convert to log scale (Small Constant)
  log_medians_avg <- log_smallvalue(medians_avg)
  log_medians_ideal <- log_smallvalue(medians_ideal)
  log_medians_reas <- log_smallvalue(medians_reas)

  # Data frame for regression
  data_for_aic <- data.frame(Reasonable = log_medians_reas, Average = log_medians_avg, Ideal = log_medians_ideal)
  data_for_aic_clean <- data_for_aic %>% filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

  # Check for sufficient data points for regression
  if(nrow(data_for_aic_clean) < 5) {
    cat("Too few valid data points for regression analysis in", country, "\n")
    return(list(country=country, sample_sizes=list(after_attention=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_summary=NULL, best_model="Insufficient Data"))
  }

  # Run Regressions
  model1 <- lm(Reasonable ~ Average, data = data_for_aic_clean)
  aic1 <- AIC(model1); r2_1 <- summary(model1)$r.squared
  model2 <- lm(Reasonable ~ Ideal, data = data_for_aic_clean)
  aic2 <- AIC(model2); r2_2 <- summary(model2)$r.squared
  model3 <- lm(Reasonable ~ Average + Ideal, data = data_for_aic_clean)
  aic3 <- AIC(model3); r2_3 <- summary(model3)$r.squared

  # Regression summary
  reg_summary <- data.frame(Model=c("Avg", "Ideal", "Both"), AIC=c(aic1, aic2, aic3), R_squared=c(r2_1, r2_2, r2_3))
  best_model_idx <- which.min(c(aic1, aic2, aic3))
  best_model_name <- c("Average", "Ideal", "Hybrid")[best_model_idx]

  cat("Best model (NO 3SD, Log Small Const):", best_model_name, "\n")

  return(list(country=country, sample_sizes=list(after_attention=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_models=list(model1=model1, model2=model2, model3=model3), regression_summary=reg_summary, aic_values=c(Model1=aic1, Model2=aic2, Model3=aic3), best_model=best_model_name))
}

# Run analysis for each country
for(country in countries) {
  country_results_median_no3sd_smallval[[country]] <- analyze_country_median_no3sd_smallval(country, Data_All_Average_Pass1, Data_All_Ideal_Pass1, Data_All_Reasonable_Pass1, question_cols)
}
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: USA ==========
## Participants after attention check: 73 72 69 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Colombia ==========
## Participants after attention check: 89 84 81 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Brazil ==========
## Participants after attention check: 43 56 52 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Germany ==========
## Participants after attention check: 71 72 72 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: India ==========
## Participants after attention check: 74 71 56 
## Best model (NO 3SD, Log Small Const): Ideal 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Lithuania ==========
## Participants after attention check: 76 76 77 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Italy ==========
## Participants after attention check: 73 83 75 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Poland ==========
## Participants after attention check: 90 87 90 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Spain ==========
## Participants after attention check: 73 69 72 
## Best model (NO 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log Small Const) for: Netherlands ==========
## Participants after attention check: 131 130 114 
## Best model (NO 3SD, Log Small Const): Hybrid
# Extract AIC summary
country_aic_summary_median_no3sd_smallval <- data.frame( Country = character(), AIC_Average = numeric(), AIC_Ideal = numeric(), AIC_Hybrid = numeric(), R2_Average = numeric(), R2_Ideal = numeric(), R2_Hybrid = numeric(), Best_Model = character(), N_Avg = integer(), N_Ideal = integer(), N_Reas = integer(), stringsAsFactors = FALSE )
for(country in countries) {
  res <- country_results_median_no3sd_smallval[[country]]
  if(!is.null(res)) {
      country_aic_summary_median_no3sd_smallval <- rbind(country_aic_summary_median_no3sd_smallval, data.frame( Country = country, AIC_Average = ifelse(!is.null(res$aic_values), res$aic_values["Model1"], NA), AIC_Ideal = ifelse(!is.null(res$aic_values), res$aic_values["Model2"], NA), AIC_Hybrid = ifelse(!is.null(res$aic_values), res$aic_values["Model3"], NA), R2_Average = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[1], NA), R2_Ideal = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[2], NA), R2_Hybrid = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[3], NA), Best_Model = res$best_model, N_Avg = res$sample_sizes$after_attention[1], N_Ideal = res$sample_sizes$after_attention[2], N_Reas = res$sample_sizes$after_attention[3], stringsAsFactors = FALSE))
  }
}

# Display summary table - THIS WILL RENDER CORRECTLY IN KNITTED HTML
kable(country_aic_summary_median_no3sd_smallval, caption = "AIC/R² by Country (Median, NO 3SD, Log Small Constant)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(8, background = case_when( is.na(country_aic_summary_median_no3sd_smallval$Best_Model) ~ "white", country_aic_summary_median_no3sd_smallval$Best_Model == "Hybrid" ~ "#e6f7ff", country_aic_summary_median_no3sd_smallval$Best_Model == "Average" ~ "#e6ffe6", country_aic_summary_median_no3sd_smallval$Best_Model == "Ideal" ~ "#ffe6e6", TRUE ~ "#f2f2f2" ))
AIC/R² by Country (Median, NO 3SD, Log Small Constant)
Country AIC_Average AIC_Ideal AIC_Hybrid R2_Average R2_Ideal R2_Hybrid Best_Model N_Avg N_Ideal N_Reas
USA 77.422 115.073 45.067 0.799 0.372 0.929 Hybrid 73 72 69
Colombia 59.425 65.683 25.018 0.877 0.852 0.959 Hybrid 89 84 81
Brazil 72.292 118.221 61.027 0.817 0.263 0.877 Hybrid 43 56 52
Germany 70.116 117.835 47.873 0.846 0.348 0.926 Hybrid 71 72 72
India 34.665 26.444 28.339 0.725 0.786 0.786 Ideal 74 71 56
Lithuania 80.975 123.127 57.232 0.843 0.437 0.928 Hybrid 76 76 77
Italy 135.084 128.639 117.402 0.321 0.441 0.626 Hybrid 73 83 75
Poland 154.103 140.355 134.872 0.276 0.523 0.620 Hybrid 90 87 90
Spain 72.224 120.854 46.453 0.825 0.237 0.925 Hybrid 73 69 72
Netherlands 71.075 114.339 25.813 0.858 0.472 0.966 Hybrid 131 130 114

7b (Plus one to all values): Country Regressions (NO 3SD Removal)

# Create storage
country_results_median_no3sd_plusone <- list()

# Function to analyze median data for a single country (NO 3SD, Plus One Log)
analyze_country_median_no3sd_plusone <- function(country, data_avg, data_ideal, data_reas, q_cols) {
  cat("\n========== Analyzing Median Data (NO 3SD, Log+1) for:", country, "==========\n")

  # Filter data by country AND apply attention check (using _Pass1 data)
  country_average_pass <- data_avg %>% filter(Country_name == country, q19 == 15)
  country_ideal_pass <- data_ideal %>% filter(Country_name == country, q19 == 15)
  country_reasonable_pass <- data_reas %>% filter(Country_name == country, q19 == 15)

  n_avg <- nrow(country_average_pass)
  n_ideal <- nrow(country_ideal_pass)
  n_reas <- nrow(country_reasonable_pass)

  cat("Participants after attention check:", n_avg, n_ideal, n_reas, "\n")

  if(n_avg < 1 || n_ideal < 1 || n_reas < 1) {
      cat("Insufficient participants for", country, "\n"); return(NULL)
  }

   convert_to_numeric <- function(df, cols) { df %>% mutate(across(all_of(cols), ~ suppressWarnings(as.numeric(as.character(.))))) }
   country_average_pass <- convert_to_numeric(country_average_pass, q_cols)
   country_ideal_pass <- convert_to_numeric(country_ideal_pass, q_cols)
   country_reasonable_pass <- convert_to_numeric(country_reasonable_pass, q_cols)

  # Calculate medians
  medians_avg <- if(n_avg > 0) sapply(country_average_pass[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_ideal <- if(n_ideal > 0) sapply(country_ideal_pass[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_reas <- if(n_reas > 0) sapply(country_reasonable_pass[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  names(medians_avg) <- q_cols; names(medians_ideal) <- q_cols; names(medians_reas) <- q_cols

  # Convert to log scale (Plus One)
  log_medians_avg <- log_plus_one(medians_avg)
  log_medians_ideal <- log_plus_one(medians_ideal)
  log_medians_reas <- log_plus_one(medians_reas)

  # Data frame for regression
  data_for_aic <- data.frame(Reasonable = log_medians_reas, Average = log_medians_avg, Ideal = log_medians_ideal)
  data_for_aic_clean <- data_for_aic %>% filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

  # Check for sufficient data points for regression
  if(nrow(data_for_aic_clean) < 5) {
    cat("Too few valid data points for regression analysis in", country, "\n")
    return(list(country=country, sample_sizes=list(after_attention=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_summary=NULL, best_model="Insufficient Data"))
  }

  # Run Regressions
  model1 <- lm(Reasonable ~ Average, data = data_for_aic_clean)
  aic1 <- AIC(model1); r2_1 <- summary(model1)$r.squared
  model2 <- lm(Reasonable ~ Ideal, data = data_for_aic_clean)
  aic2 <- AIC(model2); r2_2 <- summary(model2)$r.squared
  model3 <- lm(Reasonable ~ Average + Ideal, data = data_for_aic_clean)
  aic3 <- AIC(model3); r2_3 <- summary(model3)$r.squared

  # Regression summary
  reg_summary <- data.frame(Model=c("Avg", "Ideal", "Both"), AIC=c(aic1, aic2, aic3), R_squared=c(r2_1, r2_2, r2_3))
  best_model_idx <- which.min(c(aic1, aic2, aic3))
  best_model_name <- c("Average", "Ideal", "Hybrid")[best_model_idx]

  cat("Best model (NO 3SD, Log+1):", best_model_name, "\n") # Keep this text output if desired

  return(list(country=country, sample_sizes=list(after_attention=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_models=list(model1=model1, model2=model2, model3=model3), regression_summary=reg_summary, aic_values=c(Model1=aic1, Model2=aic2, Model3=aic3), best_model=best_model_name))
}

# Run analysis for each country
for(country in countries) {
  country_results_median_no3sd_plusone[[country]] <- analyze_country_median_no3sd_plusone(country, Data_All_Average_Pass1, Data_All_Ideal_Pass1, Data_All_Reasonable_Pass1, question_cols)
}
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: USA ==========
## Participants after attention check: 73 72 69 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Colombia ==========
## Participants after attention check: 89 84 81 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Brazil ==========
## Participants after attention check: 43 56 52 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Germany ==========
## Participants after attention check: 71 72 72 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: India ==========
## Participants after attention check: 74 71 56 
## Best model (NO 3SD, Log+1): Ideal 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Lithuania ==========
## Participants after attention check: 76 76 77 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Italy ==========
## Participants after attention check: 73 83 75 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Poland ==========
## Participants after attention check: 90 87 90 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Spain ==========
## Participants after attention check: 73 69 72 
## Best model (NO 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (NO 3SD, Log+1) for: Netherlands ==========
## Participants after attention check: 131 130 114 
## Best model (NO 3SD, Log+1): Hybrid
# Extract AIC summary
country_aic_summary_median_no3sd_plusone <- data.frame( Country = character(), AIC_Average = numeric(), AIC_Ideal = numeric(), AIC_Hybrid = numeric(), R2_Average = numeric(), R2_Ideal = numeric(), R2_Hybrid = numeric(), Best_Model = character(), N_Avg = integer(), N_Ideal = integer(), N_Reas = integer(), stringsAsFactors = FALSE )
for(country in countries) {
  res <- country_results_median_no3sd_plusone[[country]]
  if(!is.null(res)) {
      country_aic_summary_median_no3sd_plusone <- rbind(country_aic_summary_median_no3sd_plusone, data.frame( Country = country, AIC_Average = ifelse(!is.null(res$aic_values), res$aic_values["Model1"], NA), AIC_Ideal = ifelse(!is.null(res$aic_values), res$aic_values["Model2"], NA), AIC_Hybrid = ifelse(!is.null(res$aic_values), res$aic_values["Model3"], NA), R2_Average = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[1], NA), R2_Ideal = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[2], NA), R2_Hybrid = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[3], NA), Best_Model = res$best_model, N_Avg = res$sample_sizes$after_attention[1], N_Ideal = res$sample_sizes$after_attention[2], N_Reas = res$sample_sizes$after_attention[3], stringsAsFactors = FALSE))
  }
}

# Display summary table - THIS WILL RENDER CORRECTLY IN KNITTED HTML
kable(country_aic_summary_median_no3sd_plusone, caption = "AIC/R² by Country (Median, NO 3SD, Log Plus One)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(8, background = case_when( is.na(country_aic_summary_median_no3sd_plusone$Best_Model) ~ "white", country_aic_summary_median_no3sd_plusone$Best_Model == "Hybrid" ~ "#e6f7ff", country_aic_summary_median_no3sd_plusone$Best_Model == "Average" ~ "#e6ffe6", country_aic_summary_median_no3sd_plusone$Best_Model == "Ideal" ~ "#ffe6e6", TRUE ~ "#f2f2f2" ))
AIC/R² by Country (Median, NO 3SD, Log Plus One)
Country AIC_Average AIC_Ideal AIC_Hybrid R2_Average R2_Ideal R2_Hybrid Best_Model N_Avg N_Ideal N_Reas
USA 71.341 96.354 31.173 0.809 0.592 0.947 Hybrid 73 72 69
Colombia 52.671 50.030 11.213 0.888 0.896 0.970 Hybrid 89 84 81
Brazil 65.885 91.728 42.834 0.832 0.633 0.921 Hybrid 43 56 52
Germany 60.639 89.373 29.107 0.868 0.684 0.952 Hybrid 71 72 72
India 27.616 18.286 20.283 0.718 0.787 0.787 Ideal 74 71 56
Lithuania 76.326 83.534 29.576 0.845 0.807 0.965 Hybrid 76 76 77
Italy 108.514 90.092 79.292 0.486 0.706 0.801 Hybrid 73 83 75
Poland 108.270 92.209 75.961 0.606 0.758 0.861 Hybrid 90 87 90
Spain 65.335 101.746 35.004 0.840 0.519 0.940 Hybrid 73 69 72
Netherlands 60.443 79.246 -3.602 0.873 0.776 0.983 Hybrid 131 130 114

Analysis 7 [WITH 3 SD Removed]

7a (Small constant inf values): Country Regressions (WITH 3SD Removal)

# Create storage
country_results_median_3sd_smallval <- list()

# Function to analyze median data for a single country (WITH 3SD, Small Constant Log)
analyze_country_median_3sd_smallval <- function(country, data_avg_filtered, data_ideal_filtered, data_reas_filtered, q_cols) {
  cat("\n========== Analyzing Median Data (WITH 3SD, Log Small Const) for:", country, "==========\n")

  # Filter data by country (using _Filtered data)
  country_average_final <- data_avg_filtered %>% filter(Country_name == country)
  country_ideal_final <- data_ideal_filtered %>% filter(Country_name == country)
  country_reasonable_final <- data_reas_filtered %>% filter(Country_name == country)

  n_avg <- nrow(country_average_final)
  n_ideal <- nrow(country_ideal_final)
  n_reas <- nrow(country_reasonable_final)

  cat("Participants after ALL filters:", n_avg, n_ideal, n_reas, "\n")

  if(n_avg < 1 || n_ideal < 1 || n_reas < 1) {
      cat("Insufficient participants for", country, "\n"); return(NULL)
  }

   # Columns should already be numeric from section 3, but double-check
   convert_to_numeric <- function(df, cols) { df %>% mutate(across(all_of(cols), ~ suppressWarnings(as.numeric(as.character(.))))) }
   country_average_final <- convert_to_numeric(country_average_final, q_cols)
   country_ideal_final <- convert_to_numeric(country_ideal_final, q_cols)
   country_reasonable_final <- convert_to_numeric(country_reasonable_final, q_cols)

  # Calculate medians
  medians_avg <- if(n_avg > 0) sapply(country_average_final[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_ideal <- if(n_ideal > 0) sapply(country_ideal_final[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_reas <- if(n_reas > 0) sapply(country_reasonable_final[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  names(medians_avg) <- q_cols; names(medians_ideal) <- q_cols; names(medians_reas) <- q_cols

  # Convert to log scale (Small Constant)
  log_medians_avg <- log_smallvalue(medians_avg)
  log_medians_ideal <- log_smallvalue(medians_ideal)
  log_medians_reas <- log_smallvalue(medians_reas)

  # Data frame for regression
  data_for_aic <- data.frame(Reasonable = log_medians_reas, Average = log_medians_avg, Ideal = log_medians_ideal)
  data_for_aic_clean <- data_for_aic %>% filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

  # Check for sufficient data points for regression
  if(nrow(data_for_aic_clean) < 5) {
    cat("Too few valid data points for regression analysis in", country, "\n")
    return(list(country=country, sample_sizes=list(after_all_filters=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_summary=NULL, best_model="Insufficient Data"))
  }

  # Run Regressions
  model1 <- lm(Reasonable ~ Average, data = data_for_aic_clean)
  aic1 <- AIC(model1); r2_1 <- summary(model1)$r.squared
  model2 <- lm(Reasonable ~ Ideal, data = data_for_aic_clean)
  aic2 <- AIC(model2); r2_2 <- summary(model2)$r.squared
  model3 <- lm(Reasonable ~ Average + Ideal, data = data_for_aic_clean)
  aic3 <- AIC(model3); r2_3 <- summary(model3)$r.squared

  # Regression summary
  reg_summary <- data.frame(Model=c("Avg", "Ideal", "Both"), AIC=c(aic1, aic2, aic3), R_squared=c(r2_1, r2_2, r2_3))
  best_model_idx <- which.min(c(aic1, aic2, aic3))
  best_model_name <- c("Average", "Ideal", "Hybrid")[best_model_idx]

  cat("Best model (WITH 3SD, Log Small Const):", best_model_name, "\n") # Keep this text output if desired

  return(list(country=country, sample_sizes=list(after_all_filters=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_models=list(model1=model1, model2=model2, model3=model3), regression_summary=reg_summary, aic_values=c(Model1=aic1, Model2=aic2, Model3=aic3), best_model=best_model_name))
}

# Run analysis for each country
for(country in countries) {
  country_results_median_3sd_smallval[[country]] <- analyze_country_median_3sd_smallval(country, Data_All_Average_Filtered, Data_All_Ideal_Filtered, Data_All_Reasonable_Filtered, question_cols)
}
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: USA ==========
## Participants after ALL filters: 62 54 59 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Colombia ==========
## Participants after ALL filters: 64 60 62 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Brazil ==========
## Participants after ALL filters: 30 36 33 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Germany ==========
## Participants after ALL filters: 51 50 58 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: India ==========
## Participants after ALL filters: 46 45 32 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Lithuania ==========
## Participants after ALL filters: 69 58 66 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Italy ==========
## Participants after ALL filters: 53 70 63 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Poland ==========
## Participants after ALL filters: 81 65 83 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Spain ==========
## Participants after ALL filters: 61 57 58 
## Best model (WITH 3SD, Log Small Const): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log Small Const) for: Netherlands ==========
## Participants after ALL filters: 92 113 76 
## Best model (WITH 3SD, Log Small Const): Hybrid
# Extract AIC summary
country_aic_summary_median_3sd_smallval <- data.frame( Country = character(), AIC_Average = numeric(), AIC_Ideal = numeric(), AIC_Hybrid = numeric(), R2_Average = numeric(), R2_Ideal = numeric(), R2_Hybrid = numeric(), Best_Model = character(), N_Avg = integer(), N_Ideal = integer(), N_Reas = integer(), stringsAsFactors = FALSE )
for(country in countries) {
  res <- country_results_median_3sd_smallval[[country]]
  if(!is.null(res)) {
      country_aic_summary_median_3sd_smallval <- rbind(country_aic_summary_median_3sd_smallval, data.frame( Country = country, AIC_Average = ifelse(!is.null(res$aic_values), res$aic_values["Model1"], NA), AIC_Ideal = ifelse(!is.null(res$aic_values), res$aic_values["Model2"], NA), AIC_Hybrid = ifelse(!is.null(res$aic_values), res$aic_values["Model3"], NA), R2_Average = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[1], NA), R2_Ideal = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[2], NA), R2_Hybrid = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[3], NA), Best_Model = res$best_model, N_Avg = res$sample_sizes$after_all_filters[1], N_Ideal = res$sample_sizes$after_all_filters[2], N_Reas = res$sample_sizes$after_all_filters[3], stringsAsFactors = FALSE))
  }
}

# Display summary table - THIS WILL RENDER CORRECTLY IN KNITTED HTML
kable(country_aic_summary_median_3sd_smallval, caption = "AIC/R² by Country (Median, WITH 3SD, Log Small Constant)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(8, background = case_when( is.na(country_aic_summary_median_3sd_smallval$Best_Model) ~ "white", country_aic_summary_median_3sd_smallval$Best_Model == "Hybrid" ~ "#e6f7ff", country_aic_summary_median_3sd_smallval$Best_Model == "Average" ~ "#e6ffe6", country_aic_summary_median_3sd_smallval$Best_Model == "Ideal" ~ "#ffe6e6", TRUE ~ "#f2f2f2" ))
AIC/R² by Country (Median, WITH 3SD, Log Small Constant)
Country AIC_Average AIC_Ideal AIC_Hybrid R2_Average R2_Ideal R2_Hybrid Best_Model N_Avg N_Ideal N_Reas
USA 85.844 112.697 52.644 0.743 0.420 0.912 Hybrid 62 54 59
Colombia 54.749 93.865 45.604 0.891 0.643 0.922 Hybrid 64 60 62
Brazil 75.125 117.416 64.502 0.794 0.259 0.860 Hybrid 30 36 33
Germany 76.723 116.831 50.279 0.815 0.378 0.922 Hybrid 51 50 58
India 40.269 13.551 9.429 0.511 0.782 0.819 Hybrid 46 45 32
Lithuania 86.177 121.409 58.343 0.816 0.466 0.926 Hybrid 69 58 66
Italy 151.767 135.116 134.692 0.120 0.469 0.506 Hybrid 53 70 63
Poland 153.461 141.881 136.687 0.286 0.497 0.596 Hybrid 81 65 83
Spain 76.557 121.425 52.492 0.803 0.233 0.911 Hybrid 61 57 58
Netherlands 103.249 121.498 76.198 0.730 0.531 0.888 Hybrid 92 113 76

7b (Plus one to all values): Country Regressions (WITH 3SD Removal)

# Create storage
country_results_median_3sd_plusone <- list()

# Function to analyze median data for a single country (WITH 3SD, Plus One Log)
analyze_country_median_3sd_plusone <- function(country, data_avg_filtered, data_ideal_filtered, data_reas_filtered, q_cols) {
  cat("\n========== Analyzing Median Data (WITH 3SD, Log+1) for:", country, "==========\n")

  # Filter data by country (using _Filtered data)
  country_average_final <- data_avg_filtered %>% filter(Country_name == country)
  country_ideal_final <- data_ideal_filtered %>% filter(Country_name == country)
  country_reasonable_final <- data_reas_filtered %>% filter(Country_name == country)

  n_avg <- nrow(country_average_final)
  n_ideal <- nrow(country_ideal_final)
  n_reas <- nrow(country_reasonable_final)

  cat("Participants after ALL filters:", n_avg, n_ideal, n_reas, "\n")

  if(n_avg < 1 || n_ideal < 1 || n_reas < 1) {
      cat("Insufficient participants for", country, "\n"); return(NULL)
  }

   convert_to_numeric <- function(df, cols) { df %>% mutate(across(all_of(cols), ~ suppressWarnings(as.numeric(as.character(.))))) }
   country_average_final <- convert_to_numeric(country_average_final, q_cols)
   country_ideal_final <- convert_to_numeric(country_ideal_final, q_cols)
   country_reasonable_final <- convert_to_numeric(country_reasonable_final, q_cols)

  # Calculate medians
  medians_avg <- if(n_avg > 0) sapply(country_average_final[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_ideal <- if(n_ideal > 0) sapply(country_ideal_final[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  medians_reas <- if(n_reas > 0) sapply(country_reasonable_final[, q_cols], median, na.rm = TRUE) else rep(NA, length(q_cols))
  names(medians_avg) <- q_cols; names(medians_ideal) <- q_cols; names(medians_reas) <- q_cols

  # Convert to log scale (Plus One)
  log_medians_avg <- log_plus_one(medians_avg)
  log_medians_ideal <- log_plus_one(medians_ideal)
  log_medians_reas <- log_plus_one(medians_reas)

  # Data frame for regression
  data_for_aic <- data.frame(Reasonable = log_medians_reas, Average = log_medians_avg, Ideal = log_medians_ideal)
  data_for_aic_clean <- data_for_aic %>% filter(is.finite(Reasonable) & is.finite(Average) & is.finite(Ideal))

  # Check for sufficient data points for regression
  if(nrow(data_for_aic_clean) < 5) {
    cat("Too few valid data points for regression analysis in", country, "\n")
    return(list(country=country, sample_sizes=list(after_all_filters=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_summary=NULL, best_model="Insufficient Data"))
  }

  # Run Regressions
  model1 <- lm(Reasonable ~ Average, data = data_for_aic_clean)
  aic1 <- AIC(model1); r2_1 <- summary(model1)$r.squared
  model2 <- lm(Reasonable ~ Ideal, data = data_for_aic_clean)
  aic2 <- AIC(model2); r2_2 <- summary(model2)$r.squared
  model3 <- lm(Reasonable ~ Average + Ideal, data = data_for_aic_clean)
  aic3 <- AIC(model3); r2_3 <- summary(model3)$r.squared

  # Regression summary
  reg_summary <- data.frame(Model=c("Avg", "Ideal", "Both"), AIC=c(aic1, aic2, aic3), R_squared=c(r2_1, r2_2, r2_3))
  best_model_idx <- which.min(c(aic1, aic2, aic3))
  best_model_name <- c("Average", "Ideal", "Hybrid")[best_model_idx]

  cat("Best model (WITH 3SD, Log+1):", best_model_name, "\n") # Keep this text output if desired

  return(list(country=country, sample_sizes=list(after_all_filters=c(n_avg, n_ideal, n_reas)), medians=list(average=medians_avg, ideal=medians_ideal, reasonable=medians_reas), log_medians=list(average=log_medians_avg, ideal=log_medians_ideal, reasonable=log_medians_reas), regression_models=list(model1=model1, model2=model2, model3=model3), regression_summary=reg_summary, aic_values=c(Model1=aic1, Model2=aic2, Model3=aic3), best_model=best_model_name))
}

# Run analysis for each country
for(country in countries) {
  country_results_median_3sd_plusone[[country]] <- analyze_country_median_3sd_plusone(country, Data_All_Average_Filtered, Data_All_Ideal_Filtered, Data_All_Reasonable_Filtered, question_cols)
}
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: USA ==========
## Participants after ALL filters: 62 54 59 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Colombia ==========
## Participants after ALL filters: 64 60 62 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Brazil ==========
## Participants after ALL filters: 30 36 33 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Germany ==========
## Participants after ALL filters: 51 50 58 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: India ==========
## Participants after ALL filters: 46 45 32 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Lithuania ==========
## Participants after ALL filters: 69 58 66 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Italy ==========
## Participants after ALL filters: 53 70 63 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Poland ==========
## Participants after ALL filters: 81 65 83 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Spain ==========
## Participants after ALL filters: 61 57 58 
## Best model (WITH 3SD, Log+1): Hybrid 
## 
## ========== Analyzing Median Data (WITH 3SD, Log+1) for: Netherlands ==========
## Participants after ALL filters: 92 113 76 
## Best model (WITH 3SD, Log+1): Hybrid
# Extract AIC summary
country_aic_summary_median_3sd_plusone <- data.frame( Country = character(), AIC_Average = numeric(), AIC_Ideal = numeric(), AIC_Hybrid = numeric(), R2_Average = numeric(), R2_Ideal = numeric(), R2_Hybrid = numeric(), Best_Model = character(), N_Avg = integer(), N_Ideal = integer(), N_Reas = integer(), stringsAsFactors = FALSE )
for(country in countries) {
  res <- country_results_median_3sd_plusone[[country]]
  if(!is.null(res)) {
      country_aic_summary_median_3sd_plusone <- rbind(country_aic_summary_median_3sd_plusone, data.frame( Country = country, AIC_Average = ifelse(!is.null(res$aic_values), res$aic_values["Model1"], NA), AIC_Ideal = ifelse(!is.null(res$aic_values), res$aic_values["Model2"], NA), AIC_Hybrid = ifelse(!is.null(res$aic_values), res$aic_values["Model3"], NA), R2_Average = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[1], NA), R2_Ideal = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[2], NA), R2_Hybrid = ifelse(!is.null(res$regression_summary), res$regression_summary$R_squared[3], NA), Best_Model = res$best_model, N_Avg = res$sample_sizes$after_all_filters[1], N_Ideal = res$sample_sizes$after_all_filters[2], N_Reas = res$sample_sizes$after_all_filters[3], stringsAsFactors = FALSE))
  }
}

# Display summary table - THIS WILL RENDER CORRECTLY IN KNITTED HTML
kable(country_aic_summary_median_3sd_plusone, caption = "AIC/R² by Country (Median, WITH 3SD, Log Plus One)", digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  column_spec(8, background = case_when( is.na(country_aic_summary_median_3sd_plusone$Best_Model) ~ "white", country_aic_summary_median_3sd_plusone$Best_Model == "Hybrid" ~ "#e6f7ff", country_aic_summary_median_3sd_plusone$Best_Model == "Average" ~ "#e6ffe6", country_aic_summary_median_3sd_plusone$Best_Model == "Ideal" ~ "#ffe6e6", TRUE ~ "#f2f2f2" ))
AIC/R² by Country (Median, WITH 3SD, Log Plus One)
Country AIC_Average AIC_Ideal AIC_Hybrid R2_Average R2_Ideal R2_Hybrid Best_Model N_Avg N_Ideal N_Reas
USA 79.894 95.638 45.222 0.754 0.604 0.919 Hybrid 62 54 59
Colombia 47.162 57.224 20.019 0.902 0.867 0.959 Hybrid 64 60 62
Brazil 69.234 91.498 43.311 0.809 0.626 0.918 Hybrid 30 36 33
Germany 68.432 86.987 27.282 0.835 0.710 0.955 Hybrid 51 50 58
India 29.787 -0.194 -5.491 0.513 0.804 0.843 Hybrid 46 45 32
Lithuania 82.010 79.570 27.957 0.816 0.829 0.966 Hybrid 69 58 66
Italy 113.855 92.055 86.300 0.417 0.699 0.762 Hybrid 53 70 63
Poland 106.764 91.293 75.451 0.620 0.762 0.862 Hybrid 81 65 83
Spain 68.818 102.546 39.965 0.824 0.511 0.931 Hybrid 61 57 58
Netherlands 72.615 73.724 11.639 0.826 0.820 0.974 Hybrid 92 113 76

Analysis 8 [NO 3 SD Removed]

# Create storage
country_intermediacy_results_median_no3sd <- list()

# Function to analyze median intermediacy for a single country (NO 3SD)
analyze_country_intermediacy_median_no3sd <- function(country, country_results_list) {
  cat("\n========== Analyzing Median Intermediacy (NO 3SD) for:", country, "==========\n")

  # Use the median data from Analysis 7 NO 3SD
  if(is.null(country_results_list[[country]]) || is.null(country_results_list[[country]]$medians)) {
    cat("No valid median data from Analysis 7 for", country, "\n"); return(NULL)
  }
  medians_avg <- country_results_list[[country]]$medians$average
  medians_ideal <- country_results_list[[country]]$medians$ideal
  medians_reas <- country_results_list[[country]]$medians$reasonable

  if(all(is.na(medians_avg)) || all(is.na(medians_ideal)) || all(is.na(medians_reas))){
      cat("Median vectors contain only NAs for", country, "\n"); return(NULL)
  }

  # --- Calculate vectors, counts, and p-values ---
  # Assumes is_ideal_side_median, is_average_side_median, is_both_sides_median are defined
  ideal_vec <- mapply(is_ideal_side_median, medians_avg, medians_ideal, medians_reas)
  ideal_count <- sum(ideal_vec, na.rm=TRUE); ideal_n <- sum(!is.na(ideal_vec))
  ideal_p <- NA; ideal_test <- NULL
  if(ideal_n > 0){ ideal_test <- tryCatch(binom.test(ideal_count, ideal_n, p=0.5, alt="greater"), error=function(e) NULL); if(!is.null(ideal_test)) ideal_p <- ideal_test$p.value }

  avg_vec <- mapply(is_average_side_median, medians_avg, medians_ideal, medians_reas)
  avg_count <- sum(avg_vec, na.rm=TRUE); avg_n <- sum(!is.na(avg_vec))
  avg_p <- NA; avg_test <- NULL
  if(avg_n > 0){ avg_test <- tryCatch(binom.test(avg_count, avg_n, p=0.5, alt="greater"), error=function(e) NULL); if(!is.null(avg_test)) avg_p <- avg_test$p.value }

  both_vec <- mapply(is_both_sides_median, medians_avg, medians_ideal, medians_reas)
  both_count <- sum(both_vec, na.rm=TRUE); both_n <- sum(!is.na(both_vec))
  both_p <- NA; both_test <- NULL
  if(both_n > 0){ both_test <- tryCatch(binom.test(both_count, both_n, p=1/3, alt="two.sided"), error=function(e) NULL); if(!is.null(both_test)) both_p <- both_test$p.value }

  # Create summary table
  summary_df <- data.frame(
    Test = c("On Ideal Side", "On Average Side", "Between"),
    Count = c(ideal_count, avg_count, both_count),
    N = c(ideal_n, avg_n, both_n),
    Prop = c(ideal_count/ideal_n, avg_count/avg_n, both_count/both_n),
    p_value_raw = c(ideal_p, avg_p, both_p) # Store raw p-value
  )
  summary_df$Prop <- ifelse(is.nan(summary_df$Prop), NA, summary_df$Prop)
  # Format p-value HERE using the defined helper function
  summary_df$p_value_formatted <- sapply(summary_df$p_value_raw, format_pvalue)

  # Return results including raw p-values
  return(list(country=country,
              ideal_side=list(vector=ideal_vec, count=ideal_count, n=ideal_n, prop=ideal_count/ideal_n, test=ideal_test),
              average_side=list(vector=avg_vec, count=avg_count, n=avg_n, prop=avg_count/avg_n, test=avg_test),
              both_sides=list(vector=both_vec, count=both_count, n=both_n, prop=both_count/both_n, test=both_test),
              summary_df=summary_df)) # Return the df with raw and formatted p-values
}

# Run analysis for each country (using one of the NO 3SD results lists, e.g., _plusone)
analysis_7_results_no3sd_source <- country_results_median_no3sd_plusone # Choose source

for(country in countries) {
  if (exists("analysis_7_results_no3sd_source") && !is.null(analysis_7_results_no3sd_source[[country]])) {
    country_intermediacy_results_median_no3sd[[country]] <- analyze_country_intermediacy_median_no3sd(country, analysis_7_results_no3sd_source)
  } else {
    cat("\nSkipping intermediacy (NO 3SD) for", country, "due to missing results from Analysis 7.\n")
  }
}
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: USA ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Colombia ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Brazil ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Germany ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: India ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Lithuania ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Italy ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Poland ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Spain ==========
## 
## ========== Analyzing Median Intermediacy (NO 3SD) for: Netherlands ==========
# Create summary table for median intermediacy across countries (NO 3SD)
intermediacy_by_country_median_no3sd <- data.frame(Country=character(), Ideal_Prop=numeric(), Ideal_P=numeric(), Ideal_N=integer(), Avg_Prop=numeric(), Avg_P=numeric(), Avg_N=integer(), Between_Prop=numeric(), Between_P=numeric(), Between_N=integer(), stringsAsFactors=FALSE)
for(country in countries) {
  res <- country_intermediacy_results_median_no3sd[[country]]
  if(!is.null(res) && !is.null(res$summary_df)) { # Check if summary_df exists
    ideal_p_val <- res$summary_df$p_value_raw[1]
    avg_p_val <- res$summary_df$p_value_raw[2]
    between_p_val <- res$summary_df$p_value_raw[3]

    intermediacy_by_country_median_no3sd <- rbind(intermediacy_by_country_median_no3sd, data.frame(Country=country,
                                                                                                   Ideal_Prop=res$ideal_side$prop, Ideal_P=ideal_p_val, Ideal_N=res$ideal_side$n,
                                                                                                   Avg_Prop=res$average_side$prop, Avg_P=avg_p_val, Avg_N=res$average_side$n,
                                                                                                   Between_Prop=res$both_sides$prop, Between_P=between_p_val, Between_N=res$both_sides$n,
                                                                                                   stringsAsFactors=FALSE))
  } else {
    # Add row with NAs if results for country are NULL or incomplete
     intermediacy_by_country_median_no3sd <- rbind(intermediacy_by_country_median_no3sd, data.frame(Country=country, Ideal_Prop=NA, Ideal_P=NA, Ideal_N=NA, Avg_Prop=NA, Avg_P=NA, Avg_N=NA, Between_Prop=NA, Between_P=NA, Between_N=NA, stringsAsFactors=FALSE))
  }
}

# Format and display
# Handle potential NAs introduced before formatting
intermediacy_by_country_median_no3sd <- intermediacy_by_country_median_no3sd %>%
  mutate(
    Ideal_Prop_Pct = ifelse(is.na(Ideal_Prop), "NA", scales::percent(Ideal_Prop, accuracy = 0.1)),
    Avg_Prop_Pct = ifelse(is.na(Avg_Prop), "NA", scales::percent(Avg_Prop, accuracy = 0.1)),
    Between_Prop_Pct = ifelse(is.na(Between_Prop), "NA", scales::percent(Between_Prop, accuracy = 0.1)),
    Ideal_Sig = ifelse(!is.na(Ideal_P) & Ideal_P < 0.05, "*", ""),
    Avg_Sig = ifelse(!is.na(Avg_P) & Avg_P < 0.05, "*", ""),
    Between_Sig = ifelse(!is.na(Between_P) & Between_P < 0.05, "*", "")
  )

intermediacy_by_country_median_no3sd <- intermediacy_by_country_median_no3sd %>%
  arrange(desc(Between_Prop)) # Arrange using the numeric column

# Corrected select statement (replaces the old select %>% rename)
display_intermediacy_no3sd <- intermediacy_by_country_median_no3sd %>%
  select(Country,
         `Ideal Side (%)` = Ideal_Prop_Pct, Ideal_Sig = Ideal_Sig,
         `Average Side (%)` = Avg_Prop_Pct, Average_Sig = Avg_Sig,
         `Between (%)` = Between_Prop_Pct, Between_Sig = Between_Sig
         )

kable(display_intermediacy_no3sd, caption = "Intermediacy Analysis by Country (Median, NO 3SD Removal)", align = 'lcr') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F) %>%
  add_header_above(c(" " = 1, "Ideal Side*" = 2, "Average Side*" = 2, "Between*" = 2)) %>%
  column_spec(1, bold = T)
Intermediacy Analysis by Country (Median, NO 3SD Removal)
Ideal Side*
Average Side*
Between*
Country Ideal Side (%) Ideal_Sig Average Side (%) Average_Sig Between (%) Between_Sig
Lithuania 81.8%
42.4% 100.0%
Germany 84.8%
33.3% 97.0%
Poland 72.7%
42.4% 97.0%
USA 90.9%
42.4% 93.9%
Colombia 81.8%
36.4% 93.9%
Spain 72.7%
51.5% 93.9%
Italy 72.7%
33.3% 90.9%
Netherlands 81.8%
39.4% 90.9%
Brazil 75.8%
33.3% 84.8%
India 81.8%
39.4% 30.3%
# Store results
analysis_8_results_no3sd <- list(
  country_results = country_intermediacy_results_median_no3sd,
  summary = intermediacy_by_country_median_no3sd # Contains raw P values and formatted props
)

Analysis 8 [WITH 3 SD Removed]

# Analysis 8 [WITH 3 SD Removed]

# Create storage
country_intermediacy_results_median_3sd <- list()

# Function to analyze median intermediacy for a single country (WITH 3SD)
analyze_country_intermediacy_median_3sd <- function(country, country_results_list) {
  cat("\n========== Analyzing Median Intermediacy (WITH 3SD) for:", country, "==========\n")

  # Use the median data from Analysis 7 WITH 3SD (either _smallval or _plusone is fine)
  if(is.null(country_results_list[[country]]) || is.null(country_results_list[[country]]$medians)) {
    cat("No valid median data from Analysis 7 (WITH 3SD) for", country, "\n"); return(NULL)
  }

  # Extract the raw median values
  medians_avg <- country_results_list[[country]]$medians$average
  medians_ideal <- country_results_list[[country]]$medians$ideal
  medians_reas <- country_results_list[[country]]$medians$reasonable

  if(all(is.na(medians_avg)) || all(is.na(medians_ideal)) || all(is.na(medians_reas))){
      cat("Median vectors contain only NAs for", country, "\n"); return(NULL)
  }

  # Ensure intermediacy functions are defined
  # is_ideal_side_median, is_average_side_median, is_both_sides_median

  # --- Calculate vectors, counts, and p-values ---
  ideal_vec <- mapply(is_ideal_side_median, medians_avg, medians_ideal, medians_reas)
  ideal_count <- sum(ideal_vec, na.rm=TRUE); ideal_n <- sum(!is.na(ideal_vec))
  ideal_p <- NA; ideal_test <- NULL
  if(ideal_n > 0){ ideal_test <- tryCatch(binom.test(ideal_count, ideal_n, p=0.5, alt="greater"), error=function(e) NULL); if(!is.null(ideal_test)) ideal_p <- ideal_test$p.value }

  avg_vec <- mapply(is_average_side_median, medians_avg, medians_ideal, medians_reas)
  avg_count <- sum(avg_vec, na.rm=TRUE); avg_n <- sum(!is.na(avg_vec))
  avg_p <- NA; avg_test <- NULL
  if(avg_n > 0){ avg_test <- tryCatch(binom.test(avg_count, avg_n, p=0.5, alt="greater"), error=function(e) NULL); if(!is.null(avg_test)) avg_p <- avg_test$p.value }

  both_vec <- mapply(is_both_sides_median, medians_avg, medians_ideal, medians_reas)
  both_count <- sum(both_vec, na.rm=TRUE); both_n <- sum(!is.na(both_vec))
  both_p <- NA; both_test <- NULL
  if(both_n > 0){ both_test <- tryCatch(binom.test(both_count, both_n, p=1/3, alt="two.sided"), error=function(e) NULL); if(!is.null(both_test)) both_p <- both_test$p.value }


  # Create summary table
  summary_df <- data.frame(
    Test = c("On Ideal Side", "On Average Side", "Between"),
    Count = c(ideal_count, avg_count, both_count),
    N = c(ideal_n, avg_n, both_n),
    Prop = c(ideal_count/ideal_n, avg_count/avg_n, both_count/both_n),
    p_value = c(ideal_p, avg_p, both_p)
  )
  # Handle potential NaN from 0/0
  summary_df$Prop <- ifelse(is.nan(summary_df$Prop), NA, summary_df$Prop)

  summary_df$Proportion <- ifelse(is.na(summary_df$Prop), "NA", scales::percent(summary_df$Prop, accuracy=0.1))
  summary_df$p_value <- sapply(summary_df$p_value, format_pvalue) # Ensure format_pvalue handles NA


  ## --- REMOVED THIS LINE to prevent raw HTML output during loop ---
  # cat("\nMedian Intermediacy Analysis (WITH 3SD Removed) for", country, ":\n")
  # print(kable(summary_df[,c("Test", "Count", "N", "Proportion", "p_value")]) %>% kable_styling(font_size=10))

  # Return results
  return(list(country=country, ideal_side=list(vector=ideal_vec, count=ideal_count, n=ideal_n, prop=ideal_count/ideal_n, test=ideal_test), average_side=list(vector=avg_vec, count=avg_count, n=avg_n, prop=avg_count/avg_n, test=avg_test), both_sides=list(vector=both_vec, count=both_count, n=both_n, prop=both_count/both_n, test=both_test), summary=summary_df))
}

# Run analysis for each country (using one of the WITH 3SD results lists)
for(country in countries) {
 # Ensure the source list exists and has data for the country
  if (exists("country_results_median_3sd_plusone") && !is.null(country_results_median_3sd_plusone[[country]])) {
      country_intermediacy_results_median_3sd[[country]] <- analyze_country_intermediacy_median_3sd(country, country_results_median_3sd_plusone) # Using _plusone results arbitrarily
  } else {
      cat("\nSkipping intermediacy for", country, "due to missing results from Analysis 7 (WITH 3SD).\n")
  }
}
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: USA ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Colombia ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Brazil ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Germany ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: India ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Lithuania ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Italy ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Poland ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Spain ==========
## 
## ========== Analyzing Median Intermediacy (WITH 3SD) for: Netherlands ==========
# Create summary table for median intermediacy across countries (WITH 3SD)
intermediacy_by_country_median_3sd <- data.frame(Country=character(), Ideal_Prop=numeric(), Ideal_P=numeric(), Ideal_N=integer(), Avg_Prop=numeric(), Avg_P=numeric(), Avg_N=integer(), Between_Prop=numeric(), Between_P=numeric(), Between_N=integer(), stringsAsFactors=FALSE)
for(country in countries) {
  res <- country_intermediacy_results_median_3sd[[country]]
  if(!is.null(res)) {
    # Extract p-values safely
    ideal_p_val <- if(!is.null(res$ideal_side$test) && !is.null(res$ideal_side$test$p.value)) res$ideal_side$test$p.value else NA
    avg_p_val <- if(!is.null(res$average_side$test) && !is.null(res$average_side$test$p.value)) res$average_side$test$p.value else NA
    between_p_val <- if(!is.null(res$both_sides$test) && !is.null(res$both_sides$test$p.value)) res$both_sides$test$p.value else NA

    intermediacy_by_country_median_3sd <- rbind(intermediacy_by_country_median_3sd, data.frame(Country=country,
                                                                                                Ideal_Prop=res$ideal_side$prop, Ideal_P=ideal_p_val, Ideal_N=res$ideal_side$n,
                                                                                                Avg_Prop=res$average_side$prop, Avg_P=avg_p_val, Avg_N=res$average_side$n,
                                                                                                Between_Prop=res$both_sides$prop, Between_P=between_p_val, Between_N=res$both_sides$n,
                                                                                                stringsAsFactors=FALSE))
  } else {
    intermediacy_by_country_median_3sd <- rbind(intermediacy_by_country_median_3sd, data.frame(Country=country, Ideal_Prop=NA, Ideal_P=NA, Ideal_N=0, Avg_Prop=NA, Avg_P=NA, Avg_N=0, Between_Prop=NA, Between_P=NA, Between_N=0, stringsAsFactors=FALSE))
  }
}

# Format and display - THIS WILL RENDER CORRECTLY IN KNITTED HTML
# Need to handle NAs before formatting
intermediacy_by_country_median_3sd <- intermediacy_by_country_median_3sd %>%
  mutate(
    Ideal_Prop_Pct = ifelse(is.na(Ideal_Prop), NA, scales::percent(Ideal_Prop, accuracy = 0.1)),
    Avg_Prop_Pct = ifelse(is.na(Avg_Prop), NA, scales::percent(Avg_Prop, accuracy = 0.1)),
    Between_Prop_Pct = ifelse(is.na(Between_Prop), NA, scales::percent(Between_Prop, accuracy = 0.1)),
    Ideal_Sig = ifelse(!is.na(Ideal_P) & Ideal_P < 0.05, "*", ""),
    Avg_Sig = ifelse(!is.na(Avg_P) & Avg_P < 0.05, "*", ""),
    Between_Sig = ifelse(!is.na(Between_P) & Between_P < 0.05, "*", "")
  )

# Arrange based on numeric proportion before formatting
intermediacy_by_country_median_3sd <- intermediacy_by_country_median_3sd %>%
  arrange(desc(Between_Prop)) # Arrange using the numeric column

# Corrected select statement
display_intermediacy_3sd <- intermediacy_by_country_median_3sd %>%
  select(Country,
         `Ideal Side (%)` = Ideal_Prop_Pct, Ideal_Sig = Ideal_Sig,
         `Average Side (%)` = Avg_Prop_Pct, Average_Sig = Avg_Sig,
         `Between (%)` = Between_Prop_Pct, Between_Sig = Between_Sig
         )

kable(display_intermediacy_3sd, caption = "Intermediacy Analysis by Country (Median, WITH 3SD Removal)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F) %>%
  add_header_above(c(" " = 1, "Ideal Side (%)*" = 2, "Average Side (%)*" = 2, "Between (%)*" = 2)) %>%
  column_spec(1, bold = T)
Intermediacy Analysis by Country (Median, WITH 3SD Removal)
Ideal Side (%)*
Average Side (%)*
Between (%)*
Country Ideal Side (%) Ideal_Sig Average Side (%) Average_Sig Between (%) Between_Sig
Germany 84.8%
39.4% 100.0%
Lithuania 84.8%
45.5% 100.0%
Poland 63.6% 45.5% 100.0%
USA 84.8%
42.4% 97.0%
Spain 69.7%
45.5% 90.9%
Colombia 75.8%
42.4% 87.9%
Netherlands 75.8%
42.4% 87.9%
Brazil 75.8%
36.4% 81.8%
Italy 75.8%
33.3% 81.8%
India 87.9%
39.4% 3.0%
# Store results
analysis_8_results_3sd <- list(
  country_results = country_intermediacy_results_median_3sd,
  summary = intermediacy_by_country_median_3sd # Contains raw P values and formatted proportions
)

Plots for Publication

Figure 1: Visualization of Reasonable Medians by Country and Item [NO 3 SD Removed] NEEDS REVISION

# Create a mapping for question numbers to descriptive labels
question_labels <- c(
  "q1" = "Hrs TV / day", "q2" = "Sugary drinks / wk", "q3" = "Exercise hrs / wk",
  "q4" = "Calories / day", "q5" = "Vegetable servings / mo", "q6" = "Lies told / wk",
  "q7" = "Mins doctor late", "q8" = "Books read / yr", "q9" = "Romantic partners / life",
  "q10" = "Int'l conflicts / decade", "q11" = "Dollars cheated on taxes", "q12" = "% Cheating students",
  "q13" = "Phone checks / day", "q14" = "Mins phone wait time", "q15" = "Calls to parent / mo",
  "q16" = "House cleans / mo", "q17" = "Computer crashes / mo", "q18" = "% School dropouts",
  "q20" = "% Kids bullied", "q21" = "College bro drinks / wknd", "q22" = "Days to accept contract",
  "q23" = "Wks to return product", "q24" = "Hrs to reflect on offer", "q25" = "Extra costs ($10k build)",
  "q26" = "Wks construction delay", "q27" = "Loud events near homes / yr", "q28" = "% Car profits on safety",
  "q29" = "% Medical details desired", "q30" = "Wks wait for criminal trial", "q31" = "Hourly atty fee (charity)",
  "q32" = "Hrs landlord entry notice", "q33" = "Loan interest rate (%)", "q34" = "Polluter recidivism (%)"
)

# Prepare the data for visualization
prepare_figure1_data <- function() {
  # Extract country information
  countries <- unique(Data_All$Country_name)
  
  # Initialize empty dataframe
  figure1_data <- data.frame()
  
  # Add Poland expert category if it exists
  if("Poland_Expert" %in% countries) {
    countries <- countries[countries != "Poland_Expert"]
    countries <- c(countries, "Poland_Expert")
  }
  
  # Process each country
  for(country in countries) {
    country_name <- country
    if(country == "Poland_Expert") {
      country_filter <- "Poland"
      country_name <- "Poland_Expert"
    } else {
      country_filter <- country
    }
    
    # Filter by country and condition
    if(country == "Poland_Expert") {
      # Special handling for Poland expert data if needed
      country_avg <- Data_All_Average_Pass1 %>% 
        filter(Country_name == country_filter, Expert == TRUE)
      country_ideal <- Data_All_Ideal_Pass1 %>% 
        filter(Country_name == country_filter, Expert == TRUE)
      country_reasonable <- Data_All_Reasonable_Pass1 %>% 
        filter(Country_name == country_filter, Expert == TRUE)
    } else {
      country_avg <- Data_All_Average_Pass1 %>% 
        filter(Country_name == country_filter)
      country_ideal <- Data_All_Ideal_Pass1 %>% 
        filter(Country_name == country_filter)
      country_reasonable <- Data_All_Reasonable_Pass1 %>% 
        filter(Country_name == country_filter)
    }
    
    # Calculate medians for each question by condition
    for(q in question_cols) {
      if(q %in% colnames(country_avg) && q %in% colnames(country_ideal) && q %in% colnames(country_reasonable)) {
        # Calculate medians
        median_avg <- median(as.numeric(country_avg[[q]]), na.rm = TRUE)
        median_ideal <- median(as.numeric(country_ideal[[q]]), na.rm = TRUE)
        median_reasonable <- median(as.numeric(country_reasonable[[q]]), na.rm = TRUE)
        
        # Add to dataframe
        figure1_data <- rbind(figure1_data, data.frame(
          Country = country_name,
          Item = q,
          Median_average = median_avg,
          Median_ideal = median_ideal,
          Median_reasonable = median_reasonable
        ))
      }
    }
  }
  
  return(figure1_data)
}

# Generate the data
figure1_data <- prepare_figure1_data()

# Process the data to match the paper's visualization
processed_data <- figure1_data %>%
  mutate(
    Item_label = question_labels[Item],
    Color = ifelse(Median_reasonable >= Median_average & Median_reasonable <= Median_ideal |
                   Median_reasonable <= Median_average & Median_reasonable >= Median_ideal, 
                  "navy", "firebrick"),
    Shape = case_when(
      Median_ideal < Median_average ~ 'Less',      # Downward triangle
      Median_ideal > Median_average ~ 'More',      # Upward triangle  
      Median_ideal == Median_average ~ 'Equal'     # Circle
    ),
    Ratio = log2(Median_ideal/Median_average)
  )

# Calculate the ratio for ordering items
item_ratios <- processed_data %>%
  group_by(Item) %>%
  summarize(Avg_ratio = mean(Ratio, na.rm = TRUE))

# Join with the processed data
processed_data <- processed_data %>%
  left_join(item_ratios, by = "Item")

# Define country abbreviation labels
CountryLabels = c(
  'Brazil' = 'BR', 
  'Colombia' = 'CO',
  'Germany' = 'DE',
  'India' = 'IN',
  'Italy' = 'IT',
  'Lithuania' = 'LT',
  'Netherlands' = 'NL',
  'Poland' = 'PL',
  'Poland_Expert' = 'PL*',
  'Spain' = 'ES',
  'USA' = 'US'
)

# Update country names to use abbreviations for plotting
processed_data$Country_label <- CountryLabels[processed_data$Country]

# Create the figure
figure1 <- ggplot(processed_data, 
                  aes(x = Country, y = reorder(Item_label, Avg_ratio), 
                      fill = Color, color = Color)) + 
  geom_point(aes(shape = Shape), size = 5, alpha = 0.9, stroke = 1) + 
  geom_text(aes(label = round(Median_reasonable)), 
            size = 2.5, color = "white") + 
  scale_fill_identity() +
  scale_color_identity() +
  scale_shape_manual(values = c('Equal' = 21, 'Less' = 25, 'More' = 24)) +
  scale_x_discrete(labels = CountryLabels, position = "top") +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    legend.position = "none",
    text = element_text(color = "black"),
    axis.text.x = element_text(face = "bold", color = "black", size = 10),
    axis.text.y = element_text(size = 9),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Display the figure
print(figure1)

# Save the figure
ggsave("Figure1_Reasonableness.jpg", figure1, dpi = 300, width = 16, height = 25, units = "cm")

Figure 1: Visualization of Reasonable Medians by Country and Item [WITH 3 SD Removed] NEEDS REVISION

# Create a mapping for question numbers to descriptive labels
question_labels <- c(
  "q1" = "Hrs TV / day", "q2" = "Sugary drinks / wk", "q3" = "Exercise hrs / wk",
  "q4" = "Calories / day", "q5" = "Vegetable servings / mo", "q6" = "Lies told / wk",
  "q7" = "Mins doctor late", "q8" = "Books read / yr", "q9" = "Romantic partners / life",
  "q10" = "Int'l conflicts / decade", "q11" = "Dollars cheated on taxes", "q12" = "% Cheating students",
  "q13" = "Phone checks / day", "q14" = "Mins phone wait time", "q15" = "Calls to parent / mo",
  "q16" = "House cleans / mo", "q17" = "Computer crashes / mo", "q18" = "% School dropouts",
  "q20" = "% Kids bullied", "q21" = "College bro drinks / wknd", "q22" = "Days to accept contract",
  "q23" = "Wks to return product", "q24" = "Hrs to reflect on offer", "q25" = "Extra costs ($10k build)",
  "q26" = "Wks construction delay", "q27" = "Loud events near homes / yr", "q28" = "% Car profits on safety",
  "q29" = "% Medical details desired", "q30" = "Wks wait for criminal trial", "q31" = "Hourly atty fee (charity)",
  "q32" = "Hrs landlord entry notice", "q33" = "Loan interest rate (%)", "q34" = "Polluter recidivism (%)"
)

# Prepare the data for visualization
prepare_figure1_data <- function() {
  # Extract country information
  countries <- unique(Data_All$Country_name)
  
  # Initialize empty dataframe
  figure1_data <- data.frame()
  
  # Add Poland expert category if it exists
  if("Poland_Expert" %in% countries) {
    countries <- countries[countries != "Poland_Expert"]
    countries <- c(countries, "Poland_Expert")
  }
  
  # Process each country
  for(country in countries) {
    country_name <- country
    if(country == "Poland_Expert") {
      country_filter <- "Poland"
      country_name <- "Poland_Expert"
    } else {
      country_filter <- country
    }
    
    # Filter by country and condition
    if(country == "Poland_Expert") {
      # Special handling for Poland expert data if needed
      country_avg <- Data_All_Average_Filtered %>% 
        filter(Country_name == country_filter, Expert == TRUE)
      country_ideal <- Data_All_Ideal_Filtered %>% 
        filter(Country_name == country_filter, Expert == TRUE)
      country_reasonable <- Data_All_Reasonable_Filtered %>% 
        filter(Country_name == country_filter, Expert == TRUE)
    } else {
      country_avg <- Data_All_Average_Filtered %>% 
        filter(Country_name == country_filter)
      country_ideal <- Data_All_Ideal_Filtered %>% 
        filter(Country_name == country_filter)
      country_reasonable <- Data_All_Reasonable_Filtered %>% 
        filter(Country_name == country_filter)
    }
    
    # Calculate medians for each question by condition
    for(q in question_cols) {
      if(q %in% colnames(country_avg) && q %in% colnames(country_ideal) && q %in% colnames(country_reasonable)) {
        # Calculate medians
        median_avg <- median(as.numeric(country_avg[[q]]), na.rm = TRUE)
        median_ideal <- median(as.numeric(country_ideal[[q]]), na.rm = TRUE)
        median_reasonable <- median(as.numeric(country_reasonable[[q]]), na.rm = TRUE)
        
        # Add to dataframe
        figure1_data <- rbind(figure1_data, data.frame(
          Country = country_name,
          Item = q,
          Median_average = median_avg,
          Median_ideal = median_ideal,
          Median_reasonable = median_reasonable
        ))
      }
    }
  }
  
  return(figure1_data)
}

# Generate the data
figure1_data <- prepare_figure1_data()

# Process the data to match the paper's visualization
processed_data <- figure1_data %>%
  mutate(
    Item_label = question_labels[Item],
    Color = ifelse(Median_reasonable >= Median_average & Median_reasonable <= Median_ideal |
                   Median_reasonable <= Median_average & Median_reasonable >= Median_ideal, 
                  "navy", "firebrick"),
    Shape = case_when(
      Median_ideal < Median_average ~ 'Less',      # Downward triangle
      Median_ideal > Median_average ~ 'More',      # Upward triangle  
      Median_ideal == Median_average ~ 'Equal'     # Circle
    ),
    Ratio = log2(Median_ideal/Median_average)
  )

# Calculate the ratio for ordering items
item_ratios <- processed_data %>%
  group_by(Item) %>%
  summarize(Avg_ratio = mean(Ratio, na.rm = TRUE))

# Join with the processed data
processed_data <- processed_data %>%
  left_join(item_ratios, by = "Item")

# Define country abbreviation labels
CountryLabels = c(
  'Brazil' = 'BR', 
  'Colombia' = 'CO',
  'Germany' = 'DE',
  'India' = 'IN',
  'Italy' = 'IT',
  'Lithuania' = 'LT',
  'Netherlands' = 'NL',
  'Poland' = 'PL',
  'Poland_Expert' = 'PL*',
  'Spain' = 'ES',
  'USA' = 'US'
)

# Update country names to use abbreviations for plotting
processed_data$Country_label <- CountryLabels[processed_data$Country]

# Create the figure
figure1 <- ggplot(processed_data, 
                  aes(x = Country, y = reorder(Item_label, Avg_ratio), 
                      fill = Color, color = Color)) + 
  geom_point(aes(shape = Shape), size = 5, alpha = 0.9, stroke = 1) + 
  geom_text(aes(label = round(Median_reasonable)), 
            size = 2.5, color = "white") + 
  scale_fill_identity() +
  scale_color_identity() +
  scale_shape_manual(values = c('Equal' = 21, 'Less' = 25, 'More' = 24)) +
  scale_x_discrete(labels = CountryLabels, position = "top") +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    legend.position = "none",
    text = element_text(color = "black"),
    axis.text.x = element_text(face = "bold", color = "black", size = 10),
    axis.text.y = element_text(size = 9),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Display the figure
print(figure1)

# Save the figure
ggsave("Figure1_Reasonableness.jpg", figure1, dpi = 300, width = 16, height = 25, units = "cm")